2 # Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. Rights for redistribution and usage in source and binary
13 # forms are granted according to the OpenSSL license.
14 # ====================================================================
16 # sha256/512_block procedure for x86_64.
18 # 40% improvement over compiler-generated code on Opteron. On EM64T
19 # sha256 was observed to run >80% faster and sha512 - >40%. No magical
20 # tricks, just straight implementation... I really wonder why gcc
21 # [being armed with inline assembler] fails to generate as fast code.
22 # The only thing which is cool about this module is that it's the very
23 # same instruction sequence used for both SHA-256 and SHA-512. In
24 # the former case the instructions operate on 32-bit operands, while in
25 # the latter - on 64-bit ones. All I had to do was to get one flavor right,
26 # the other one passed the test right away:-)
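#
# An illustrative scalar sketch of that shared structure (never called by
# the generator): the very same Sigma expression serves both flavors, only
# the word width and the rotate counts taken from the @Sigma*/@sigma*
# tables differ - e.g. Sigma1 uses 6/11/25 on 32-bit words for SHA-256 and
# 14/18/41 on 64-bit words for SHA-512 (per FIPS 180-4).
sub _ref_Sigma1_256 {
    my ($x) = @_;
    my $rotr = sub { my ($v,$n)=@_; (($v>>$n)|($v<<(32-$n))) & 0xffffffff };
    return $rotr->($x,6) ^ $rotr->($x,11) ^ $rotr->($x,25);
}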
28 # sha256_block runs in ~1005 cycles on Opteron, which gives you
29 # asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
30 # frequency in GHz. sha512_block runs in ~1275 cycles, which results
31 # in 128*1000/1275=100MBps per GHz. Is there room for improvement?
32 # Well, if you compare it to IA-64 implementation, which maintains
33 # X[16] in register bank[!], tends to 4 instructions per CPU clock
34 # cycle and runs in 1003 cycles, 1275 is very good result for 3-way
35 # issue Opteron pipeline and X[16] maintained in memory. So that *if*
36 # there is a way to improve it, *then* the only way would be to try to
37 # offload X[16] updates to SSE unit, but that would require "deeper"
38 # loop unroll, which in turn would naturally cause size blow-up, not
39 # to mention increased complexity! And once again, only *if* it's
40 # actually possible to noticeably improve overall ILP, instruction
41 # level parallelism, on a given CPU implementation in this case.
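#
# The MBps-per-GHz arithmetic above, spelled out as a tiny helper
# (illustrative only, never called): bytes per block over cycles per
# block, scaled so that 1GHz=1e9 cycles/s and 1MB=1e6 bytes.
sub _ref_mbps_per_ghz {
    my ($block_bytes,$cycles) = @_;
    return $block_bytes/$cycles*1000;	# 64/1005 -> ~63.7, 128/1275 -> ~100
}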
43 # Special note on Intel EM64T. While Opteron CPU exhibits perfect
44 # performance ratio of 1.5 between 64- and 32-bit flavors [see above],
45 # [currently available] EM64T CPUs apparently are far from it. On the
46 # contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
47 # sha256_block:-( This is presumably because 64-bit shifts/rotates
48 # apparently are not atomic instructions, but implemented in microcode.
52 # Optimization including one of Pavel Semjanov's ideas, alternative
53 # Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
54 # unfortunately -2% SHA512 on P4 [which nobody should care about].
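#
# The alternative Maj mentioned above relies on the identity
# Maj(a,b,c) == Ch(a^b,c,b) == ((b^c)&(a^b))^b, which lets the b^c value
# be carried over from the previous round (see the round code below).
# A brute-force check of the identity (illustrative only, never called):
sub _check_alt_maj {
    for my $a (0,1) { for my $b (0,1) { for my $c (0,1) {
	my $maj = ($a&$b)^($a&$c)^($b&$c);
	return 0 if (((($b^$c)&($a^$b))^$b) != $maj);
    }}}
    return 1;
}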
59 # Add SIMD code paths, see below for improvement coefficients. SSSE3
60 # code path was not attempted for SHA512, because improvement is not
61 # estimated to be high enough, noticeably less than 9%, to justify
62 # the effort, not on pre-AVX processors. [Obviously with exclusion
63 # for VIA Nano, but it has SHA512 instruction that is faster and
64 # should be used instead.] For reference, corresponding estimated
65 # upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
66 # higher coefficients are observed on VIA Nano and Bulldozer has more
67 # to do with specifics of their architecture [which is topic for
68 # separate discussion].
72 # Add AVX2 code path. Two consecutive input blocks are loaded to
73 # 256-bit %ymm registers, with data from first block to least
74 # significant 128-bit halves and data from second to most significant.
75 # The data is then processed with same SIMD instruction sequence as
76 # for AVX, but with %ymm as operands. Side effect is increased stack
77 # frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
82 # Add support for Intel SHA Extensions.
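#
# Returning to the AVX2 layout described above, an illustrative sketch
# (never called) of how two consecutive blocks map onto the %ymm message
# registers for SHA-256, mirroring what the vinserti128 loads in the avx2
# code path construct:
sub _ref_avx2_lane_layout {
    my ($blk0,$blk1) = @_;	# refs to the 16 32-bit words of two blocks
    my @ymm;
    for my $i (0..3) {
	$ymm[$i] = [ @{$blk0}[4*$i..4*$i+3],	# least significant 128 bits
		     @{$blk1}[4*$i..4*$i+3] ];	# most significant 128 bits
    }
    return @ymm;
}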
84 ######################################################################
85 # Current performance in cycles per processed byte (less is better):
87 # SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*)
89 # AMD K8 14.9 - - 9.57 -
91 # Core 2 15.6 13.8(+13%) - 9.97 -
92 # Westmere 14.8 12.3(+19%) - 9.58 -
93 # Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**))
94 # Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%)
95 # Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%)
96 # Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%)
97 # Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%)
98 # VIA Nano 23.0 16.5(+39%) - 14.7 -
99 # Atom 23.0 18.9(+22%) - 14.7 -
100 # Silvermont 27.4 20.6(+33%) - 17.5 -
102 # (*) whichever best applicable;
103 # (**) switch from ror to shrd stands for fair share of improvement;
104 # (***) execution time is fully determined by remaining integer-only
105 # part, body_00_15; reducing the amount of SIMD instructions
106 # below certain limit makes no difference/sense; to conserve
107 # space SHA256 XOP code path is therefore omitted;
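#
# (For orientation: cycles per byte converts to throughput as clock/cpb;
#  at an assumed 3.5GHz, for example, the Haswell SSSE3 figure of 9.28
#  above corresponds to roughly 3.5e9/9.28 = ~377MB/s. Illustrative
#  arithmetic only, the clock is not taken from this table.)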
111 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
113 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
115 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
116 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
117 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
118 die "can't locate x86_64-xlate.pl";
120 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
121 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
122 $avx = ($1>=2.19) + ($1>=2.22);
125 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
126 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
127 $avx = ($1>=2.09) + ($1>=2.10);
130 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
131 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
132 $avx = ($1>=10) + ($1>=11);
135 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
136 $avx = ($2>=3.0) + ($2>3.0);
139 $shaext=1; ### set to zero if compiling for 1.0.1
140 $avx=1 if (!$shaext && $avx);
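# Rough summary of the resulting dispatch (illustrative note only; the bit
# positions are the ones named in the OPENSSL_ia32cap_P comments below):
#   $avx==0 - scalar code plus, for SHA-256, the SSSE3 and SHAEXT paths;
#   $avx==1 - additionally AVX (and, for SHA-512, XOP when bit 11 is set),
#             taken when the AVX/SSSE3 capability bits check out;
#   $avx==2 - additionally AVX2, taken when BMI1+AVX2+BMI2 (bits 3,5,8)
#             are all advertised.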
142 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
145 if ($output =~ /512/) {
146 $func="sha512_block_data_order";
149 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
150 "%r8", "%r9", "%r10","%r11");
151 ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
158 $func="sha256_block_data_order";
161 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
162 "%r8d","%r9d","%r10d","%r11d");
163 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
171 $ctx="%rdi"; # 1st arg, zapped by $a3
172 $inp="%rsi"; # 2nd arg
175 $_ctx="16*$SZ+0*8(%rsp)";
176 $_inp="16*$SZ+1*8(%rsp)";
177 $_end="16*$SZ+2*8(%rsp)";
178 $_rsp="16*$SZ+3*8(%rsp)";
179 $framesz="16*$SZ+4*8";
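# For reference, the scalar stack frame this lays out (offsets from the
# 64-byte-aligned %rsp established in the prologue; illustrative note):
#   0 .. 16*$SZ-1	X[0..15] circular buffer (64 bytes SHA-256, 128 SHA-512)
#   16*$SZ+0*8		$_ctx, saved ctx pointer
#   16*$SZ+1*8		$_inp, saved input pointer
#   16*$SZ+2*8		$_end, input end pointer
#   16*$SZ+3*8		$_rsp, caller's %rsp
# i.e. $framesz comes to 96 bytes for SHA-256 ($SZ==4) and 160 for SHA-512.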
183 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
185 $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
188 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
192 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
195 mov $T1,`$SZ*($i&0xf)`(%rsp)
199 ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
201 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
203 ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
205 add $a2,$T1 # T1+=Ch(e,f,g)
208 add ($Tbl),$T1 # T1+=K[round]
211 xor $b,$a2 # a^b, b^c in next round
212 ror \$$Sigma1[0],$a0 # Sigma1(e)
216 ror \$$Sigma0[0],$a1 # Sigma0(a)
217 add $a0,$T1 # T1+=Sigma1(e)
219 xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
223 lea $STRIDE($Tbl),$Tbl # round++
225 $code.=<<___ if ($i<15);
226 add $a1,$h # h+=Sigma0(a)
228 ($a2,$a3) = ($a3,$a2);	# swap a^b/b^c scratch registers for next round
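
# For reference, a plain scalar rendition of what ROUND_00_15 (and, for
# rounds 16 and up, ROUND_16_XX's message expansion) computes, written for
# the 32-bit SHA-256 flavor; illustrative only, never called:
sub _ref_sha256_round {
    my ($X,$K,$i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;	# $X,$K are array refs
    my $rotr = sub { my ($v,$n)=@_; (($v>>$n)|($v<<(32-$n))) & 0xffffffff };
    if ($i>=16) {	# X[i%16] += sigma1(X[(i+14)%16]) + X[(i+9)%16] + sigma0(X[(i+1)%16])
	my $x1  = $X->[($i+1)&0xf];
	my $x14 = $X->[($i+14)&0xf];
	my $s0  = $rotr->($x1,7)   ^ $rotr->($x1,18)  ^ ($x1>>3);
	my $s1  = $rotr->($x14,17) ^ $rotr->($x14,19) ^ ($x14>>10);
	$X->[$i&0xf] = ($X->[$i&0xf] + $s0 + $s1 + $X->[($i+9)&0xf]) & 0xffffffff;
    }
    my $S1  = $rotr->($e,6) ^ $rotr->($e,11) ^ $rotr->($e,25);	# Sigma1(e)
    my $S0  = $rotr->($a,2) ^ $rotr->($a,13) ^ $rotr->($a,22);	# Sigma0(a)
    my $ch  = (($f^$g)&$e)^$g;		# Ch(e,f,g), as in the comments above
    my $maj = (($b^$c)&($a^$b))^$b;	# alternative Maj(a,b,c)=Ch(a^b,c,b)
    my $T1  = ($h + $S1 + $ch + $K->[$i] + $X->[$i&0xf]) & 0xffffffff;
    return ((($d+$T1)&0xffffffff), (($T1+$S0+$maj)&0xffffffff));	# new e, new a
}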
232 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
235 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
236 mov `$SZ*(($i+14)&0xf)`(%rsp),$a2
239 ror \$`$sigma0[1]-$sigma0[0]`,$a0
240 add $a1,$a # modulo-scheduled h+=Sigma0(a)
242 ror \$`$sigma1[1]-$sigma1[0]`,$a2
251 xor $a0,$T1 # sigma0(X[(i+1)&0xf])
252 xor $a1,$a2 # sigma1(X[(i+14)&0xf])
253 add `$SZ*(($i+9)&0xf)`(%rsp),$T1
255 add `$SZ*($i&0xf)`(%rsp),$T1
266 .extern OPENSSL_ia32cap_P
268 .type $func,\@function,3
272 $code.=<<___ if ($SZ==4 || $avx);
273 lea OPENSSL_ia32cap_P(%rip),%r11
278 $code.=<<___ if ($SZ==4 && $shaext);
279 test \$`1<<29`,%r11d # check for SHA
282 $code.=<<___ if ($avx && $SZ==8);
283 test \$`1<<11`,%r10d # check for XOP
286 $code.=<<___ if ($avx>1);
287 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
288 cmp \$`1<<8|1<<5|1<<3`,%r11d
291 $code.=<<___ if ($avx);
292 and \$`1<<30`,%r9d # mask "Intel CPU" bit
293 and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
295 cmp \$`1<<28|1<<9|1<<30`,%r10d
298 $code.=<<___ if ($SZ==4);
309 mov %rsp,%r11 # copy %rsp
310 shl \$4,%rdx # num*16
312 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
313 and \$-64,%rsp # align stack frame
314 mov $ctx,$_ctx # save ctx, 1st arg
315 mov $inp,$_inp # save inp, 2nd arg
316 mov %rdx,$_end # save end pointer, "3rd" arg
317 mov %r11,$_rsp # save copy of %rsp
333 lea $TABLE(%rip),$Tbl
336 for($i=0;$i<16;$i++) {
337 $code.=" mov $SZ*$i($inp),$T1\n";
338 $code.=" mov @ROT[4],$a0\n";
339 $code.=" mov @ROT[0],$a1\n";
340 $code.=" bswap $T1\n";
341 &ROUND_00_15($i,@ROT);
342 unshift(@ROT,pop(@ROT));
350 &ROUND_16_XX($i,@ROT);
351 unshift(@ROT,pop(@ROT));
355 cmpb \$0,`$SZ-1`($Tbl) # K values never have this byte zero; $TABLE's trailer does
359 add $a1,$A # modulo-scheduled h+=Sigma0(a)
360 lea 16*$SZ($inp),$inp
399 .type $TABLE,\@object
401 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
402 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
403 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
404 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
405 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
406 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
407 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
408 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
409 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
410 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
411 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
412 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
413 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
414 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
415 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
416 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
417 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
418 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
419 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
420 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
421 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
422 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
423 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
424 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
425 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
426 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
427 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
428 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
429 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
430 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
431 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
432 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
434 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
435 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
436 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
437 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
438 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
439 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
440 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
445 .type $TABLE,\@object
447 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
448 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
449 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
450 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
451 .quad 0x3956c25bf348b538,0x59f111f1b605d019
452 .quad 0x3956c25bf348b538,0x59f111f1b605d019
453 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
454 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
455 .quad 0xd807aa98a3030242,0x12835b0145706fbe
456 .quad 0xd807aa98a3030242,0x12835b0145706fbe
457 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
458 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
459 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
460 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
461 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
462 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
463 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
464 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
465 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
466 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
467 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
468 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
469 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
470 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
471 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
472 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
473 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
474 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
475 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
476 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
477 .quad 0x06ca6351e003826f,0x142929670a0e6e70
478 .quad 0x06ca6351e003826f,0x142929670a0e6e70
479 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
480 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
481 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
482 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
483 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
484 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
485 .quad 0x81c2c92e47edaee6,0x92722c851482353b
486 .quad 0x81c2c92e47edaee6,0x92722c851482353b
487 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
488 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
489 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
490 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
491 .quad 0xd192e819d6ef5218,0xd69906245565a910
492 .quad 0xd192e819d6ef5218,0xd69906245565a910
493 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
494 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
495 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
496 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
497 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
498 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
499 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
500 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
501 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
502 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
503 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
504 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
505 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
506 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
507 .quad 0x90befffa23631e28,0xa4506cebde82bde9
508 .quad 0x90befffa23631e28,0xa4506cebde82bde9
509 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
510 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
511 .quad 0xca273eceea26619c,0xd186b8c721c0c207
512 .quad 0xca273eceea26619c,0xd186b8c721c0c207
513 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
514 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
515 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
516 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
517 .quad 0x113f9804bef90dae,0x1b710b35131c471b
518 .quad 0x113f9804bef90dae,0x1b710b35131c471b
519 .quad 0x28db77f523047d84,0x32caab7b40c72493
520 .quad 0x28db77f523047d84,0x32caab7b40c72493
521 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
522 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
523 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
524 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
525 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
526 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
528 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
529 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
530 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
534 ######################################################################
537 if ($SZ==4 && $shaext) {{{
538 ######################################################################
539 # Intel SHA Extensions implementation of SHA256 update function.
541 my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
543 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
544 my @MSG=map("%xmm$_",(3..6));
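
# (For orientation: the hash state is stored in the context in A..H word
#  order; the pshufd/palignr/punpcklqdq shuffles below repack it into the
#  ABEF and CDGH register ordering that sha256rnds2 expects.  Each
#  sha256rnds2 performs two rounds and takes its two message+constant
#  words from the low half of %xmm0, so the "pshufd \$0x0e" between the
#  paired instructions brings the next two words down - four rounds per
#  32-byte $Tbl entry.  Illustrative note only.)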
547 .type sha256_block_data_order_shaext,\@function,3
549 sha256_block_data_order_shaext:
552 $code.=<<___ if ($win64);
553 lea `-8-5*16`(%rsp),%rsp
554 movaps %xmm6,-8-5*16(%rax)
555 movaps %xmm7,-8-4*16(%rax)
556 movaps %xmm8,-8-3*16(%rax)
557 movaps %xmm9,-8-2*16(%rax)
558 movaps %xmm10,-8-1*16(%rax)
562 lea K256+0x80(%rip),$Tbl
563 movdqu ($ctx),$ABEF # DCBA
564 movdqu 16($ctx),$CDGH # HGFE
565 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
567 pshufd \$0x1b,$ABEF,$Wi # ABCD
568 pshufd \$0xb1,$ABEF,$ABEF # CDAB
569 pshufd \$0x1b,$CDGH,$CDGH # EFGH
570 movdqa $TMP,$BSWAP # offload
571 palignr \$8,$CDGH,$ABEF # ABEF
572 punpcklqdq $Wi,$CDGH # CDGH
577 movdqu ($inp),@MSG[0]
578 movdqu 0x10($inp),@MSG[1]
579 movdqu 0x20($inp),@MSG[2]
581 movdqu 0x30($inp),@MSG[3]
583 movdqa 0*32-0x80($Tbl),$Wi
586 movdqa $CDGH,$CDGH_SAVE # offload
587 sha256rnds2 $ABEF,$CDGH # 0-3
588 pshufd \$0x0e,$Wi,$Wi
590 movdqa $ABEF,$ABEF_SAVE # offload
591 sha256rnds2 $CDGH,$ABEF
593 movdqa 1*32-0x80($Tbl),$Wi
596 sha256rnds2 $ABEF,$CDGH # 4-7
597 pshufd \$0x0e,$Wi,$Wi
599 sha256msg1 @MSG[1],@MSG[0]
600 sha256rnds2 $CDGH,$ABEF
602 movdqa 2*32-0x80($Tbl),$Wi
605 sha256rnds2 $ABEF,$CDGH # 8-11
606 pshufd \$0x0e,$Wi,$Wi
608 palignr \$4,@MSG[2],$TMP
611 sha256msg1 @MSG[2],@MSG[1]
612 sha256rnds2 $CDGH,$ABEF
614 movdqa 3*32-0x80($Tbl),$Wi
616 sha256msg2 @MSG[3],@MSG[0]
617 sha256rnds2 $ABEF,$CDGH # 12-15
618 pshufd \$0x0e,$Wi,$Wi
620 palignr \$4,@MSG[3],$TMP
623 sha256msg1 @MSG[3],@MSG[2]
624 sha256rnds2 $CDGH,$ABEF
626 for($i=4;$i<16-3;$i++) {
628 movdqa $i*32-0x80($Tbl),$Wi
630 sha256msg2 @MSG[0],@MSG[1]
631 sha256rnds2 $ABEF,$CDGH # 16-19...
632 pshufd \$0x0e,$Wi,$Wi
634 palignr \$4,@MSG[0],$TMP
637 sha256msg1 @MSG[0],@MSG[3]
638 sha256rnds2 $CDGH,$ABEF
640 push(@MSG,shift(@MSG));
643 movdqa 13*32-0x80($Tbl),$Wi
645 sha256msg2 @MSG[0],@MSG[1]
646 sha256rnds2 $ABEF,$CDGH # 52-55
647 pshufd \$0x0e,$Wi,$Wi
649 palignr \$4,@MSG[0],$TMP
650 sha256rnds2 $CDGH,$ABEF
653 movdqa 14*32-0x80($Tbl),$Wi
655 sha256rnds2 $ABEF,$CDGH # 56-59
656 pshufd \$0x0e,$Wi,$Wi
657 sha256msg2 @MSG[1],@MSG[2]
659 sha256rnds2 $CDGH,$ABEF
661 movdqa 15*32-0x80($Tbl),$Wi
664 sha256rnds2 $ABEF,$CDGH # 60-63
665 pshufd \$0x0e,$Wi,$Wi
668 sha256rnds2 $CDGH,$ABEF
670 paddd $CDGH_SAVE,$CDGH
671 paddd $ABEF_SAVE,$ABEF
674 pshufd \$0xb1,$CDGH,$CDGH # DCHG
675 pshufd \$0x1b,$ABEF,$TMP # FEBA
676 pshufd \$0xb1,$ABEF,$ABEF # BAFE
677 punpckhqdq $CDGH,$ABEF # DCBA
678 palignr \$8,$TMP,$CDGH # HGFE
681 movdqu $CDGH,16($ctx)
683 $code.=<<___ if ($win64);
684 movaps -8-5*16(%rax),%xmm6
685 movaps -8-4*16(%rax),%xmm7
686 movaps -8-3*16(%rax),%xmm8
687 movaps -8-2*16(%rax),%xmm9
688 movaps -8-1*16(%rax),%xmm10
694 .size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
700 my ($a,$b,$c,$d,$e,$f,$g,$h);
702 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
703 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
705 $arg = "\$$arg" if ($arg*1 eq $arg);
706 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
711 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
713 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
717 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
719 '&xor ($a4,$g)', # f^g
721 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
723 '&and ($a4,$e)', # (f^g)&e
726 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
729 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
730 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
731 '&xor ($a2,$b)', # a^b, b^c in next round
733 '&add ($h,$a4)', # h+=Ch(e,f,g)
734 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
735 '&and ($a3,$a2)', # (b^c)&(a^b)
738 '&add ($h,$a0)', # h+=Sigma1(e)
739 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
741 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
742 '&add ($d,$h)', # d+=h
743 '&add ($h,$a3)', # h+=Maj(a,b,c)
746 '&add ($a1,$h);'. # h+=Sigma0(a)
747 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
751 ######################################################################
754 if ($SZ==4) { # SHA256 only
755 my @X = map("%xmm$_",(0..3));
756 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
759 .type ${func}_ssse3,\@function,3
769 mov %rsp,%r11 # copy %rsp
770 shl \$4,%rdx # num*16
771 sub \$`$framesz+$win64*16*4`,%rsp
772 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
773 and \$-64,%rsp # align stack frame
774 mov $ctx,$_ctx # save ctx, 1st arg
775 mov $inp,$_inp # save inp, 2nd arg
776 mov %rdx,$_end # save end pointer, "3rd" arg
777 mov %r11,$_rsp # save copy of %rsp
779 $code.=<<___ if ($win64);
780 movaps %xmm6,16*$SZ+32(%rsp)
781 movaps %xmm7,16*$SZ+48(%rsp)
782 movaps %xmm8,16*$SZ+64(%rsp)
783 movaps %xmm9,16*$SZ+80(%rsp)
799 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
800 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
804 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
805 movdqu 0x00($inp),@X[0]
806 movdqu 0x10($inp),@X[1]
807 movdqu 0x20($inp),@X[2]
809 movdqu 0x30($inp),@X[3]
810 lea $TABLE(%rip),$Tbl
812 movdqa 0x00($Tbl),$t0
813 movdqa 0x20($Tbl),$t1
816 movdqa 0x40($Tbl),$t2
818 movdqa 0x60($Tbl),$t3
822 movdqa $t0,0x00(%rsp)
824 movdqa $t1,0x10(%rsp)
826 movdqa $t2,0x20(%rsp)
828 movdqa $t3,0x30(%rsp)
834 sub \$`-16*2*$SZ`,$Tbl # size optimization
836 sub Xupdate_256_SSSE3 () {
838 '&movdqa ($t0,@X[1]);',
839 '&movdqa ($t3,@X[3])',
840 '&palignr ($t0,@X[0],$SZ)', # X[1..4]
841 '&palignr ($t3,@X[2],$SZ);', # X[9..12]
843 '&movdqa ($t2,$t0);',
844 '&psrld ($t0,$sigma0[2])',
845 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
846 '&psrld ($t2,$sigma0[0])',
847 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
848 '&pslld ($t1,8*$SZ-$sigma0[1]);'.
850 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
852 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
855 '&pxor ($t0,$t1);', # sigma0(X[1..4])
856 '&psrld ($t3,$sigma1[2])',
857 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
858 '&psrlq ($t2,$sigma1[0])',
860 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
862 '&pshufb ($t3,$t4)', # sigma1(X[14..15])
863 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
864 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
865 '&movdqa ($t2,$t3);',
866 '&psrld ($t3,$sigma1[2])',
867 '&psrlq ($t2,$sigma1[0])',
869 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
871 '&movdqa ($t2,16*2*$j."($Tbl)")',
873 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
877 sub SSSE3_256_00_47 () {
881 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
884 foreach (Xupdate_256_SSSE3()) { # 36 instructions
890 } else { # squeeze extra 4% on Westmere and 19% on Atom
891 eval(shift(@insns)); #@
896 eval(shift(@insns)); #@
899 eval(shift(@insns)); #@
901 &palignr ($t0,@X[0],$SZ); # X[1..4]
904 &palignr ($t3,@X[2],$SZ); # X[9..12]
908 eval(shift(@insns)); #@
913 eval(shift(@insns)); #@
915 &psrld ($t0,$sigma0[2]);
919 &paddd (@X[0],$t3); # X[0..3] += X[9..12]
920 eval(shift(@insns)); #@
922 &psrld ($t2,$sigma0[0]);
925 &pshufd ($t3,@X[3],0b11111010); # X[14..15]
927 eval(shift(@insns)); #@
928 &pslld ($t1,8*$SZ-$sigma0[1]);
932 eval(shift(@insns)); #@
935 eval(shift(@insns)); #@
936 &psrld ($t2,$sigma0[1]-$sigma0[0]);
941 &pslld ($t1,$sigma0[1]-$sigma0[0]);
946 eval(shift(@insns)); #@
950 &pxor ($t0,$t1); # sigma0(X[1..4])
951 eval(shift(@insns)); #@
954 &psrld ($t3,$sigma1[2]);
957 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
958 eval(shift(@insns)); #@
960 &psrlq ($t2,$sigma1[0]);
965 eval(shift(@insns)); #@
968 eval(shift(@insns)); #@
969 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
973 eval(shift(@insns)); #@
976 #&pshufb ($t3,$t4); # sigma1(X[14..15])
977 &pshufd ($t3,$t3,0b10000000);
983 eval(shift(@insns)); #@
986 eval(shift(@insns)); #@
987 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
991 &pshufd ($t3,@X[0],0b01010000); # X[16..17]
993 eval(shift(@insns)); #@
998 &psrld ($t3,$sigma1[2]);
1000 eval(shift(@insns)); #@
1001 &psrlq ($t2,$sigma1[0]);
1002 eval(shift(@insns));
1003 eval(shift(@insns));
1005 eval(shift(@insns)); #@
1006 eval(shift(@insns));
1007 eval(shift(@insns));
1008 eval(shift(@insns)); #@
1009 eval(shift(@insns));
1010 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
1011 eval(shift(@insns));
1012 eval(shift(@insns));
1013 eval(shift(@insns));
1015 eval(shift(@insns));
1016 eval(shift(@insns));
1017 eval(shift(@insns)); #@
1019 &pshufd ($t3,$t3,0b00001000);
1020 eval(shift(@insns));
1021 eval(shift(@insns));
1022 &movdqa ($t2,16*2*$j."($Tbl)");
1023 eval(shift(@insns)); #@
1024 eval(shift(@insns));
1026 eval(shift(@insns));
1027 eval(shift(@insns));
1028 eval(shift(@insns));
1029 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1030 eval(shift(@insns)); #@
1031 eval(shift(@insns));
1032 eval(shift(@insns));
1035 foreach (@insns) { eval; } # remaining instructions
1036 &movdqa (16*$j."(%rsp)",$t2);
1039 for ($i=0,$j=0; $j<4; $j++) {
1040 &SSSE3_256_00_47($j,\&body_00_15,@X);
1041 push(@X,shift(@X)); # rotate(@X)
1043 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1044 &jne (".Lssse3_00_47");
1046 for ($i=0; $i<16; ) {
1047 foreach(body_00_15()) { eval; }
1054 lea 16*$SZ($inp),$inp
1077 $code.=<<___ if ($win64);
1078 movaps 16*$SZ+32(%rsp),%xmm6
1079 movaps 16*$SZ+48(%rsp),%xmm7
1080 movaps 16*$SZ+64(%rsp),%xmm8
1081 movaps 16*$SZ+80(%rsp),%xmm9
1093 .size ${func}_ssse3,.-${func}_ssse3
1098 ######################################################################
1101 if ($SZ==8) { # SHA512 only
1103 .type ${func}_xop,\@function,3
1113 mov %rsp,%r11 # copy %rsp
1114 shl \$4,%rdx # num*16
1115 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1116 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1117 and \$-64,%rsp # align stack frame
1118 mov $ctx,$_ctx # save ctx, 1st arg
1119 mov $inp,$_inp # save inp, 2nd arg
1120 mov %rdx,$_end # save end pointer, "3rd" arg
1121 mov %r11,$_rsp # save copy of %rsp
1123 $code.=<<___ if ($win64);
1124 movaps %xmm6,16*$SZ+32(%rsp)
1125 movaps %xmm7,16*$SZ+48(%rsp)
1126 movaps %xmm8,16*$SZ+64(%rsp)
1127 movaps %xmm9,16*$SZ+80(%rsp)
1129 $code.=<<___ if ($win64 && $SZ>4);
1130 movaps %xmm10,16*$SZ+96(%rsp)
1131 movaps %xmm11,16*$SZ+112(%rsp)
1147 if ($SZ==4) { # SHA256
1148 my @X = map("%xmm$_",(0..3));
1149 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1154 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1155 vmovdqu 0x00($inp),@X[0]
1156 vmovdqu 0x10($inp),@X[1]
1157 vmovdqu 0x20($inp),@X[2]
1158 vmovdqu 0x30($inp),@X[3]
1159 vpshufb $t3,@X[0],@X[0]
1160 lea $TABLE(%rip),$Tbl
1161 vpshufb $t3,@X[1],@X[1]
1162 vpshufb $t3,@X[2],@X[2]
1163 vpaddd 0x00($Tbl),@X[0],$t0
1164 vpshufb $t3,@X[3],@X[3]
1165 vpaddd 0x20($Tbl),@X[1],$t1
1166 vpaddd 0x40($Tbl),@X[2],$t2
1167 vpaddd 0x60($Tbl),@X[3],$t3
1168 vmovdqa $t0,0x00(%rsp)
1170 vmovdqa $t1,0x10(%rsp)
1172 vmovdqa $t2,0x20(%rsp)
1174 vmovdqa $t3,0x30(%rsp)
1180 sub \$`-16*2*$SZ`,$Tbl # size optimization
1182 sub XOP_256_00_47 () {
1186 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1188 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
1189 eval(shift(@insns));
1190 eval(shift(@insns));
1191 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
1192 eval(shift(@insns));
1193 eval(shift(@insns));
1194 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
1195 eval(shift(@insns));
1196 eval(shift(@insns));
1197 &vpsrld ($t0,$t0,$sigma0[2]);
1198 eval(shift(@insns));
1199 eval(shift(@insns));
1200 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
1201 eval(shift(@insns));
1202 eval(shift(@insns));
1203 eval(shift(@insns));
1204 eval(shift(@insns));
1205 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
1206 eval(shift(@insns));
1207 eval(shift(@insns));
1208 &vpxor ($t0,$t0,$t1);
1209 eval(shift(@insns));
1210 eval(shift(@insns));
1211 eval(shift(@insns));
1212 eval(shift(@insns));
1213 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
1214 eval(shift(@insns));
1215 eval(shift(@insns));
1216 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
1217 eval(shift(@insns));
1218 eval(shift(@insns));
1219 &vpsrld ($t2,@X[3],$sigma1[2]);
1220 eval(shift(@insns));
1221 eval(shift(@insns));
1222 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
1223 eval(shift(@insns));
1224 eval(shift(@insns));
1225 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1226 eval(shift(@insns));
1227 eval(shift(@insns));
1228 &vpxor ($t3,$t3,$t2);
1229 eval(shift(@insns));
1230 eval(shift(@insns));
1231 eval(shift(@insns));
1232 eval(shift(@insns));
1233 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1234 eval(shift(@insns));
1235 eval(shift(@insns));
1236 eval(shift(@insns));
1237 eval(shift(@insns));
1238 &vpsrldq ($t3,$t3,8);
1239 eval(shift(@insns));
1240 eval(shift(@insns));
1241 eval(shift(@insns));
1242 eval(shift(@insns));
1243 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1244 eval(shift(@insns));
1245 eval(shift(@insns));
1246 eval(shift(@insns));
1247 eval(shift(@insns));
1248 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
1249 eval(shift(@insns));
1250 eval(shift(@insns));
1251 &vpsrld ($t2,@X[0],$sigma1[2]);
1252 eval(shift(@insns));
1253 eval(shift(@insns));
1254 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1255 eval(shift(@insns));
1256 eval(shift(@insns));
1257 &vpxor ($t3,$t3,$t2);
1258 eval(shift(@insns));
1259 eval(shift(@insns));
1260 eval(shift(@insns));
1261 eval(shift(@insns));
1262 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
1263 eval(shift(@insns));
1264 eval(shift(@insns));
1265 eval(shift(@insns));
1266 eval(shift(@insns));
1267 &vpslldq ($t3,$t3,8); # 22 instructions
1268 eval(shift(@insns));
1269 eval(shift(@insns));
1270 eval(shift(@insns));
1271 eval(shift(@insns));
1272 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1273 eval(shift(@insns));
1274 eval(shift(@insns));
1275 eval(shift(@insns));
1276 eval(shift(@insns));
1277 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1278 foreach (@insns) { eval; } # remaining instructions
1279 &vmovdqa (16*$j."(%rsp)",$t2);
1282 for ($i=0,$j=0; $j<4; $j++) {
1283 &XOP_256_00_47($j,\&body_00_15,@X);
1284 push(@X,shift(@X)); # rotate(@X)
1286 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1287 &jne (".Lxop_00_47");
1289 for ($i=0; $i<16; ) {
1290 foreach(body_00_15()) { eval; }
1294 my @X = map("%xmm$_",(0..7));
1295 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1300 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1301 vmovdqu 0x00($inp),@X[0]
1302 lea $TABLE+0x80(%rip),$Tbl # size optimization
1303 vmovdqu 0x10($inp),@X[1]
1304 vmovdqu 0x20($inp),@X[2]
1305 vpshufb $t3,@X[0],@X[0]
1306 vmovdqu 0x30($inp),@X[3]
1307 vpshufb $t3,@X[1],@X[1]
1308 vmovdqu 0x40($inp),@X[4]
1309 vpshufb $t3,@X[2],@X[2]
1310 vmovdqu 0x50($inp),@X[5]
1311 vpshufb $t3,@X[3],@X[3]
1312 vmovdqu 0x60($inp),@X[6]
1313 vpshufb $t3,@X[4],@X[4]
1314 vmovdqu 0x70($inp),@X[7]
1315 vpshufb $t3,@X[5],@X[5]
1316 vpaddq -0x80($Tbl),@X[0],$t0
1317 vpshufb $t3,@X[6],@X[6]
1318 vpaddq -0x60($Tbl),@X[1],$t1
1319 vpshufb $t3,@X[7],@X[7]
1320 vpaddq -0x40($Tbl),@X[2],$t2
1321 vpaddq -0x20($Tbl),@X[3],$t3
1322 vmovdqa $t0,0x00(%rsp)
1323 vpaddq 0x00($Tbl),@X[4],$t0
1324 vmovdqa $t1,0x10(%rsp)
1325 vpaddq 0x20($Tbl),@X[5],$t1
1326 vmovdqa $t2,0x20(%rsp)
1327 vpaddq 0x40($Tbl),@X[6],$t2
1328 vmovdqa $t3,0x30(%rsp)
1329 vpaddq 0x60($Tbl),@X[7],$t3
1330 vmovdqa $t0,0x40(%rsp)
1332 vmovdqa $t1,0x50(%rsp)
1334 vmovdqa $t2,0x60(%rsp)
1336 vmovdqa $t3,0x70(%rsp)
1342 add \$`16*2*$SZ`,$Tbl
1344 sub XOP_512_00_47 () {
1348 my @insns = (&$body,&$body); # 52 instructions
1350 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
1351 eval(shift(@insns));
1352 eval(shift(@insns));
1353 &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
1354 eval(shift(@insns));
1355 eval(shift(@insns));
1356 &vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
1357 eval(shift(@insns));
1358 eval(shift(@insns));
1359 &vpsrlq ($t0,$t0,$sigma0[2]);
1360 eval(shift(@insns));
1361 eval(shift(@insns));
1362 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
1363 eval(shift(@insns));
1364 eval(shift(@insns));
1365 eval(shift(@insns));
1366 eval(shift(@insns));
1367 &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
1368 eval(shift(@insns));
1369 eval(shift(@insns));
1370 &vpxor ($t0,$t0,$t1);
1371 eval(shift(@insns));
1372 eval(shift(@insns));
1373 eval(shift(@insns));
1374 eval(shift(@insns));
1375 &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
1376 eval(shift(@insns));
1377 eval(shift(@insns));
1378 &vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
1379 eval(shift(@insns));
1380 eval(shift(@insns));
1381 &vpsrlq ($t2,@X[7],$sigma1[2]);
1382 eval(shift(@insns));
1383 eval(shift(@insns));
1384 &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
1385 eval(shift(@insns));
1386 eval(shift(@insns));
1387 &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
1388 eval(shift(@insns));
1389 eval(shift(@insns));
1390 &vpxor ($t3,$t3,$t2);
1391 eval(shift(@insns));
1392 eval(shift(@insns));
1393 eval(shift(@insns));
1394 eval(shift(@insns));
1395 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1396 eval(shift(@insns));
1397 eval(shift(@insns));
1398 eval(shift(@insns));
1399 eval(shift(@insns));
1400 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1401 eval(shift(@insns));
1402 eval(shift(@insns));
1403 eval(shift(@insns));
1404 eval(shift(@insns));
1405 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1406 foreach (@insns) { eval; } # remaining instructions
1407 &vmovdqa (16*$j."(%rsp)",$t2);
1410 for ($i=0,$j=0; $j<8; $j++) {
1411 &XOP_512_00_47($j,\&body_00_15,@X);
1412 push(@X,shift(@X)); # rotate(@X)
1414 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1415 &jne (".Lxop_00_47");
1417 for ($i=0; $i<16; ) {
1418 foreach(body_00_15()) { eval; }
1426 lea 16*$SZ($inp),$inp
1450 $code.=<<___ if ($win64);
1451 movaps 16*$SZ+32(%rsp),%xmm6
1452 movaps 16*$SZ+48(%rsp),%xmm7
1453 movaps 16*$SZ+64(%rsp),%xmm8
1454 movaps 16*$SZ+80(%rsp),%xmm9
1456 $code.=<<___ if ($win64 && $SZ>4);
1457 movaps 16*$SZ+96(%rsp),%xmm10
1458 movaps 16*$SZ+112(%rsp),%xmm11
1470 .size ${func}_xop,.-${func}_xop
1473 ######################################################################
1474 # AVX+shrd code path
1476 local *ror = sub { &shrd(@_[0],@_) };
1479 .type ${func}_avx,\@function,3
1489 mov %rsp,%r11 # copy %rsp
1490 shl \$4,%rdx # num*16
1491 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1492 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1493 and \$-64,%rsp # align stack frame
1494 mov $ctx,$_ctx # save ctx, 1st arg
1495 mov $inp,$_inp # save inp, 2nd arg
1496 mov %rdx,$_end # save end pointer, "3rd" arg
1497 mov %r11,$_rsp # save copy of %rsp
1499 $code.=<<___ if ($win64);
1500 movaps %xmm6,16*$SZ+32(%rsp)
1501 movaps %xmm7,16*$SZ+48(%rsp)
1502 movaps %xmm8,16*$SZ+64(%rsp)
1503 movaps %xmm9,16*$SZ+80(%rsp)
1505 $code.=<<___ if ($win64 && $SZ>4);
1506 movaps %xmm10,16*$SZ+96(%rsp)
1507 movaps %xmm11,16*$SZ+112(%rsp)
1522 if ($SZ==4) { # SHA256
1523 my @X = map("%xmm$_",(0..3));
1524 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1527 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1528 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1532 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1533 vmovdqu 0x00($inp),@X[0]
1534 vmovdqu 0x10($inp),@X[1]
1535 vmovdqu 0x20($inp),@X[2]
1536 vmovdqu 0x30($inp),@X[3]
1537 vpshufb $t3,@X[0],@X[0]
1538 lea $TABLE(%rip),$Tbl
1539 vpshufb $t3,@X[1],@X[1]
1540 vpshufb $t3,@X[2],@X[2]
1541 vpaddd 0x00($Tbl),@X[0],$t0
1542 vpshufb $t3,@X[3],@X[3]
1543 vpaddd 0x20($Tbl),@X[1],$t1
1544 vpaddd 0x40($Tbl),@X[2],$t2
1545 vpaddd 0x60($Tbl),@X[3],$t3
1546 vmovdqa $t0,0x00(%rsp)
1548 vmovdqa $t1,0x10(%rsp)
1550 vmovdqa $t2,0x20(%rsp)
1552 vmovdqa $t3,0x30(%rsp)
1558 sub \$`-16*2*$SZ`,$Tbl # size optimization
1560 sub Xupdate_256_AVX () {
1562 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
1563 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
1564 '&vpsrld ($t2,$t0,$sigma0[0]);',
1565 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
1566 '&vpsrld ($t3,$t0,$sigma0[2])',
1567 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
1568 '&vpxor ($t0,$t3,$t2)',
1569 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
1570 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1571 '&vpxor ($t0,$t0,$t1)',
1572 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1573 '&vpxor ($t0,$t0,$t2)',
1574 '&vpsrld ($t2,$t3,$sigma1[2]);',
1575 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
1576 '&vpsrlq ($t3,$t3,$sigma1[0]);',
1577 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
1578 '&vpxor ($t2,$t2,$t3);',
1579 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1580 '&vpxor ($t2,$t2,$t3)',
1581 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
1582 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
1583 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
1584 '&vpsrld ($t2,$t3,$sigma1[2])',
1585 '&vpsrlq ($t3,$t3,$sigma1[0])',
1586 '&vpxor ($t2,$t2,$t3);',
1587 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1588 '&vpxor ($t2,$t2,$t3)',
1589 '&vpshufb ($t2,$t2,$t5)',
1590 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
1594 sub AVX_256_00_47 () {
1598 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1600 foreach (Xupdate_256_AVX()) { # 29 instructions
1602 eval(shift(@insns));
1603 eval(shift(@insns));
1604 eval(shift(@insns));
1606 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1607 foreach (@insns) { eval; } # remaining instructions
1608 &vmovdqa (16*$j."(%rsp)",$t2);
1611 for ($i=0,$j=0; $j<4; $j++) {
1612 &AVX_256_00_47($j,\&body_00_15,@X);
1613 push(@X,shift(@X)); # rotate(@X)
1615 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1616 &jne (".Lavx_00_47");
1618 for ($i=0; $i<16; ) {
1619 foreach(body_00_15()) { eval; }
1623 my @X = map("%xmm$_",(0..7));
1624 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1630 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1631 vmovdqu 0x00($inp),@X[0]
1632 lea $TABLE+0x80(%rip),$Tbl # size optimization
1633 vmovdqu 0x10($inp),@X[1]
1634 vmovdqu 0x20($inp),@X[2]
1635 vpshufb $t3,@X[0],@X[0]
1636 vmovdqu 0x30($inp),@X[3]
1637 vpshufb $t3,@X[1],@X[1]
1638 vmovdqu 0x40($inp),@X[4]
1639 vpshufb $t3,@X[2],@X[2]
1640 vmovdqu 0x50($inp),@X[5]
1641 vpshufb $t3,@X[3],@X[3]
1642 vmovdqu 0x60($inp),@X[6]
1643 vpshufb $t3,@X[4],@X[4]
1644 vmovdqu 0x70($inp),@X[7]
1645 vpshufb $t3,@X[5],@X[5]
1646 vpaddq -0x80($Tbl),@X[0],$t0
1647 vpshufb $t3,@X[6],@X[6]
1648 vpaddq -0x60($Tbl),@X[1],$t1
1649 vpshufb $t3,@X[7],@X[7]
1650 vpaddq -0x40($Tbl),@X[2],$t2
1651 vpaddq -0x20($Tbl),@X[3],$t3
1652 vmovdqa $t0,0x00(%rsp)
1653 vpaddq 0x00($Tbl),@X[4],$t0
1654 vmovdqa $t1,0x10(%rsp)
1655 vpaddq 0x20($Tbl),@X[5],$t1
1656 vmovdqa $t2,0x20(%rsp)
1657 vpaddq 0x40($Tbl),@X[6],$t2
1658 vmovdqa $t3,0x30(%rsp)
1659 vpaddq 0x60($Tbl),@X[7],$t3
1660 vmovdqa $t0,0x40(%rsp)
1662 vmovdqa $t1,0x50(%rsp)
1664 vmovdqa $t2,0x60(%rsp)
1666 vmovdqa $t3,0x70(%rsp)
1672 add \$`16*2*$SZ`,$Tbl
1674 sub Xupdate_512_AVX () {
1676 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
1677 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
1678 '&vpsrlq ($t2,$t0,$sigma0[0])',
1679 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
1680 '&vpsrlq ($t3,$t0,$sigma0[2])',
1681 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
1682 '&vpxor ($t0,$t3,$t2)',
1683 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1684 '&vpxor ($t0,$t0,$t1)',
1685 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1686 '&vpxor ($t0,$t0,$t2)',
1687 '&vpsrlq ($t3,@X[7],$sigma1[2]);',
1688 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
1689 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
1690 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
1691 '&vpsrlq ($t1,@X[7],$sigma1[0]);',
1692 '&vpxor ($t3,$t3,$t2)',
1693 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
1694 '&vpxor ($t3,$t3,$t1)',
1695 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
1696 '&vpxor ($t3,$t3,$t2)',
1697 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
1698 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
1702 sub AVX_512_00_47 () {
1706 my @insns = (&$body,&$body); # 52 instructions
1708 foreach (Xupdate_512_AVX()) { # 23 instructions
1710 eval(shift(@insns));
1711 eval(shift(@insns));
1713 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1714 foreach (@insns) { eval; } # remaining instructions
1715 &vmovdqa (16*$j."(%rsp)",$t2);
1718 for ($i=0,$j=0; $j<8; $j++) {
1719 &AVX_512_00_47($j,\&body_00_15,@X);
1720 push(@X,shift(@X)); # rotate(@X)
1722 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1723 &jne (".Lavx_00_47");
1725 for ($i=0; $i<16; ) {
1726 foreach(body_00_15()) { eval; }
1734 lea 16*$SZ($inp),$inp
1758 $code.=<<___ if ($win64);
1759 movaps 16*$SZ+32(%rsp),%xmm6
1760 movaps 16*$SZ+48(%rsp),%xmm7
1761 movaps 16*$SZ+64(%rsp),%xmm8
1762 movaps 16*$SZ+80(%rsp),%xmm9
1764 $code.=<<___ if ($win64 && $SZ>4);
1765 movaps 16*$SZ+96(%rsp),%xmm10
1766 movaps 16*$SZ+112(%rsp),%xmm11
1778 .size ${func}_avx,.-${func}_avx
1782 ######################################################################
1783 # AVX2+BMI code path
1785 my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
1789 sub bodyx_00_15 () {
1790 # at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
1792 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1794 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
1795 '&and ($a4,$e)', # f&e
1796 '&rorx ($a0,$e,$Sigma1[2])',
1797 '&rorx ($a2,$e,$Sigma1[1])',
1799 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
1800 '&lea ($h,"($h,$a4)")',
1801 '&andn ($a4,$e,$g)', # ~e&g
1804 '&rorx ($a1,$e,$Sigma1[0])',
1805 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
1806 '&xor ($a0,$a1)', # Sigma1(e)
1809 '&rorx ($a4,$a,$Sigma0[2])',
1810 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
1811 '&xor ($a2,$b)', # a^b, b^c in next round
1812 '&rorx ($a1,$a,$Sigma0[1])',
1814 '&rorx ($a0,$a,$Sigma0[0])',
1815 '&lea ($d,"($d,$h)")', # d+=h
1816 '&and ($a3,$a2)', # (b^c)&(a^b)
1819 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
1820 '&xor ($a1,$a0)', # Sigma0(a)
1821 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
1822 '&mov ($a4,$e)', # copy of f in future
1824 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1826 # and at the finish one has to $a+=$a1
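
# The lea-based accumulation works because (e&f) and (~e&g) can never have
# a bit set in common, so their sum equals Ch(e,f,g); a brute-force check
# of that claim (illustrative only, never called):
sub _check_bmi_ch {
    for my $e (0,1) { for my $f (0,1) { for my $g (0,1) {
	my $ch = ($e&$f)^(((~$e)&1)&$g);	# Ch(e,f,g) on one bit
	return 0 if ((($e&$f)+(((~$e)&1)&$g)) != $ch);
    }}}
    return 1;
}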
1830 .type ${func}_avx2,\@function,3
1840 mov %rsp,%r11 # copy %rsp
1841 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1842 shl \$4,%rdx # num*16
1843 and \$-256*$SZ,%rsp # align stack frame
1844 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1845 add \$`2*$SZ*($rounds-8)`,%rsp
1846 mov $ctx,$_ctx # save ctx, 1st arg
1847 mov $inp,$_inp # save inp, 2nd arg
1848 mov %rdx,$_end # save end pointer, "3rd" arg
1849 mov %r11,$_rsp # save copy of %rsp
1851 $code.=<<___ if ($win64);
1852 movaps %xmm6,16*$SZ+32(%rsp)
1853 movaps %xmm7,16*$SZ+48(%rsp)
1854 movaps %xmm8,16*$SZ+64(%rsp)
1855 movaps %xmm9,16*$SZ+80(%rsp)
1857 $code.=<<___ if ($win64 && $SZ>4);
1858 movaps %xmm10,16*$SZ+96(%rsp)
1859 movaps %xmm11,16*$SZ+112(%rsp)
1865 sub \$-16*$SZ,$inp # inp++, size optimization
1867 mov $inp,%r12 # borrow $T1
1869 cmp %rdx,$inp # $_end
1871 cmove %rsp,%r12 # next block or random data
1878 if ($SZ==4) { # SHA256
1879 my @X = map("%ymm$_",(0..3));
1880 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1883 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1884 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1888 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1889 vmovdqu -16*$SZ+0($inp),%xmm0
1890 vmovdqu -16*$SZ+16($inp),%xmm1
1891 vmovdqu -16*$SZ+32($inp),%xmm2
1892 vmovdqu -16*$SZ+48($inp),%xmm3
1893 #mov $inp,$_inp # offload $inp
1894 vinserti128 \$1,(%r12),@X[0],@X[0]
1895 vinserti128 \$1,16(%r12),@X[1],@X[1]
1896 vpshufb $t3,@X[0],@X[0]
1897 vinserti128 \$1,32(%r12),@X[2],@X[2]
1898 vpshufb $t3,@X[1],@X[1]
1899 vinserti128 \$1,48(%r12),@X[3],@X[3]
1901 lea $TABLE(%rip),$Tbl
1902 vpshufb $t3,@X[2],@X[2]
1903 vpaddd 0x00($Tbl),@X[0],$t0
1904 vpshufb $t3,@X[3],@X[3]
1905 vpaddd 0x20($Tbl),@X[1],$t1
1906 vpaddd 0x40($Tbl),@X[2],$t2
1907 vpaddd 0x60($Tbl),@X[3],$t3
1908 vmovdqa $t0,0x00(%rsp)
1910 vmovdqa $t1,0x20(%rsp)
1911 lea -$PUSH8(%rsp),%rsp
1913 vmovdqa $t2,0x00(%rsp)
1915 vmovdqa $t3,0x20(%rsp)
1917 sub \$-16*2*$SZ,$Tbl # size optimization
1924 sub AVX2_256_00_47 () {
1928 my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
1929 my $base = "+2*$PUSH8(%rsp)";
1931 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
1932 foreach (Xupdate_256_AVX()) { # 29 instructions
1934 eval(shift(@insns));
1935 eval(shift(@insns));
1936 eval(shift(@insns));
1938 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1939 foreach (@insns) { eval; } # remaining instructions
1940 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
1943 for ($i=0,$j=0; $j<4; $j++) {
1944 &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1945 push(@X,shift(@X)); # rotate(@X)
1947 &lea ($Tbl,16*2*$SZ."($Tbl)");
1948 &cmpb (($SZ-1)."($Tbl)",0);
1949 &jne (".Lavx2_00_47");
1951 for ($i=0; $i<16; ) {
1952 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1953 foreach(bodyx_00_15()) { eval; }
1956 my @X = map("%ymm$_",(0..7));
1957 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
1963 vmovdqu -16*$SZ($inp),%xmm0
1964 vmovdqu -16*$SZ+16($inp),%xmm1
1965 vmovdqu -16*$SZ+32($inp),%xmm2
1966 lea $TABLE+0x80(%rip),$Tbl # size optimization
1967 vmovdqu -16*$SZ+48($inp),%xmm3
1968 vmovdqu -16*$SZ+64($inp),%xmm4
1969 vmovdqu -16*$SZ+80($inp),%xmm5
1970 vmovdqu -16*$SZ+96($inp),%xmm6
1971 vmovdqu -16*$SZ+112($inp),%xmm7
1972 #mov $inp,$_inp # offload $inp
1973 vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2
1974 vinserti128 \$1,(%r12),@X[0],@X[0]
1975 vinserti128 \$1,16(%r12),@X[1],@X[1]
1976 vpshufb $t2,@X[0],@X[0]
1977 vinserti128 \$1,32(%r12),@X[2],@X[2]
1978 vpshufb $t2,@X[1],@X[1]
1979 vinserti128 \$1,48(%r12),@X[3],@X[3]
1980 vpshufb $t2,@X[2],@X[2]
1981 vinserti128 \$1,64(%r12),@X[4],@X[4]
1982 vpshufb $t2,@X[3],@X[3]
1983 vinserti128 \$1,80(%r12),@X[5],@X[5]
1984 vpshufb $t2,@X[4],@X[4]
1985 vinserti128 \$1,96(%r12),@X[6],@X[6]
1986 vpshufb $t2,@X[5],@X[5]
1987 vinserti128 \$1,112(%r12),@X[7],@X[7]
1989 vpaddq -0x80($Tbl),@X[0],$t0
1990 vpshufb $t2,@X[6],@X[6]
1991 vpaddq -0x60($Tbl),@X[1],$t1
1992 vpshufb $t2,@X[7],@X[7]
1993 vpaddq -0x40($Tbl),@X[2],$t2
1994 vpaddq -0x20($Tbl),@X[3],$t3
1995 vmovdqa $t0,0x00(%rsp)
1996 vpaddq 0x00($Tbl),@X[4],$t0
1997 vmovdqa $t1,0x20(%rsp)
1998 vpaddq 0x20($Tbl),@X[5],$t1
1999 vmovdqa $t2,0x40(%rsp)
2000 vpaddq 0x40($Tbl),@X[6],$t2
2001 vmovdqa $t3,0x60(%rsp)
2002 lea -$PUSH8(%rsp),%rsp
2003 vpaddq 0x60($Tbl),@X[7],$t3
2004 vmovdqa $t0,0x00(%rsp)
2006 vmovdqa $t1,0x20(%rsp)
2008 vmovdqa $t2,0x40(%rsp)
2010 vmovdqa $t3,0x60(%rsp)
2019 sub AVX2_512_00_47 () {
2023 my @insns = (&$body,&$body); # 48 instructions
2024 my $base = "+2*$PUSH8(%rsp)";
2026 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0);
2027 foreach (Xupdate_512_AVX()) { # 23 instructions
2030 eval(shift(@insns));
2031 eval(shift(@insns));
2032 eval(shift(@insns));
2035 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
2036 foreach (@insns) { eval; } # remaining instructions
2037 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
2040 for ($i=0,$j=0; $j<8; $j++) {
2041 &AVX2_512_00_47($j,\&bodyx_00_15,@X);
2042 push(@X,shift(@X)); # rotate(@X)
2044 &lea ($Tbl,16*2*$SZ."($Tbl)");
2045 &cmpb (($SZ-1-0x80)."($Tbl)",0);
2046 &jne (".Lavx2_00_47");
2048 for ($i=0; $i<16; ) {
2049 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2050 foreach(bodyx_00_15()) { eval; }
2054 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
2056 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
2057 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
2077 cmp `$PUSH8+2*8`($Tbl),$inp # $_end
2088 for ($i=0; $i<8; ) {
2089 my $base="+16($Tbl)";
2090 foreach(bodyx_00_15()) { eval; }
2093 lea -$PUSH8($Tbl),$Tbl
2097 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
2099 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
2100 lea `2*$SZ*($rounds-8)`(%rsp),%rsp
2108 lea `2*16*$SZ`($inp),$inp # inp+=2
2115 cmove %rsp,%r12 # next block or stale data
2132 $code.=<<___ if ($win64);
2133 movaps 16*$SZ+32(%rsp),%xmm6
2134 movaps 16*$SZ+48(%rsp),%xmm7
2135 movaps 16*$SZ+64(%rsp),%xmm8
2136 movaps 16*$SZ+80(%rsp),%xmm9
2138 $code.=<<___ if ($win64 && $SZ>4);
2139 movaps 16*$SZ+96(%rsp),%xmm10
2140 movaps 16*$SZ+112(%rsp),%xmm11
2152 .size ${func}_avx2,.-${func}_avx2
2157 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2158 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2166 .extern __imp_RtlVirtualUnwind
2167 .type se_handler,\@abi-omnipotent
2181 mov 120($context),%rax # pull context->Rax
2182 mov 248($context),%rbx # pull context->Rip
2184 mov 8($disp),%rsi # disp->ImageBase
2185 mov 56($disp),%r11 # disp->HandlerData
2187 mov 0(%r11),%r10d # HandlerData[0]
2188 lea (%rsi,%r10),%r10 # prologue label
2189 cmp %r10,%rbx # context->Rip<prologue label
2192 mov 152($context),%rax # pull context->Rsp
2194 mov 4(%r11),%r10d # HandlerData[1]
2195 lea (%rsi,%r10),%r10 # epilogue label
2196 cmp %r10,%rbx # context->Rip>=epilogue label
2199 $code.=<<___ if ($avx>1);
2200 lea .Lavx2_shortcut(%rip),%r10
2201 cmp %r10,%rbx # context->Rip<avx2_shortcut
2205 add \$`2*$SZ*($rounds-8)`,%rax
2209 mov %rax,%rsi # put aside Rsp
2210 mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
2219 mov %rbx,144($context) # restore context->Rbx
2220 mov %rbp,160($context) # restore context->Rbp
2221 mov %r12,216($context) # restore context->R12
2222 mov %r13,224($context) # restore context->R13
2223 mov %r14,232($context) # restore context->R14
2224 mov %r15,240($context) # restore context->R15
2226 lea .Lepilogue(%rip),%r10
2228 jb .Lin_prologue # non-AVX code
2230 lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area
2231 lea 512($context),%rdi # &context.Xmm6
2232 mov \$`$SZ==4?8:12`,%ecx
2233 .long 0xa548f3fc # cld; rep movsq
2238 mov %rax,152($context) # restore context->Rsp
2239 mov %rsi,168($context) # restore context->Rsi
2240 mov %rdi,176($context) # restore context->Rdi
2242 mov 40($disp),%rdi # disp->ContextRecord
2243 mov $context,%rsi # context
2244 mov \$154,%ecx # sizeof(CONTEXT)
2245 .long 0xa548f3fc # cld; rep movsq
2248 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2249 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2250 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2251 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2252 mov 40(%rsi),%r10 # disp->ContextRecord
2253 lea 56(%rsi),%r11 # &disp->HandlerData
2254 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2255 mov %r10,32(%rsp) # arg5
2256 mov %r11,40(%rsp) # arg6
2257 mov %r12,48(%rsp) # arg7
2258 mov %rcx,56(%rsp) # arg8, (NULL)
2259 call *__imp_RtlVirtualUnwind(%rip)
2261 mov \$1,%eax # ExceptionContinueSearch
2273 .size se_handler,.-se_handler
2276 $code.=<<___ if ($SZ==4 && $shaext);
2277 .type shaext_handler,\@abi-omnipotent
2291 mov 120($context),%rax # pull context->Rax
2292 mov 248($context),%rbx # pull context->Rip
2294 lea .Lprologue_shaext(%rip),%r10
2295 cmp %r10,%rbx # context->Rip<.Lprologue
2298 lea .Lepilogue_shaext(%rip),%r10
2299 cmp %r10,%rbx # context->Rip>=.Lepilogue
2302 lea -8-5*16(%rax),%rsi
2303 lea 512($context),%rdi # &context.Xmm6
2305 .long 0xa548f3fc # cld; rep movsq
2308 .size shaext_handler,.-shaext_handler
2314 .rva .LSEH_begin_$func
2315 .rva .LSEH_end_$func
2316 .rva .LSEH_info_$func
2318 $code.=<<___ if ($SZ==4 && $shaext);
2319 .rva .LSEH_begin_${func}_shaext
2320 .rva .LSEH_end_${func}_shaext
2321 .rva .LSEH_info_${func}_shaext
2323 $code.=<<___ if ($SZ==4);
2324 .rva .LSEH_begin_${func}_ssse3
2325 .rva .LSEH_end_${func}_ssse3
2326 .rva .LSEH_info_${func}_ssse3
2328 $code.=<<___ if ($avx && $SZ==8);
2329 .rva .LSEH_begin_${func}_xop
2330 .rva .LSEH_end_${func}_xop
2331 .rva .LSEH_info_${func}_xop
2333 $code.=<<___ if ($avx);
2334 .rva .LSEH_begin_${func}_avx
2335 .rva .LSEH_end_${func}_avx
2336 .rva .LSEH_info_${func}_avx
2338 $code.=<<___ if ($avx>1);
2339 .rva .LSEH_begin_${func}_avx2
2340 .rva .LSEH_end_${func}_avx2
2341 .rva .LSEH_info_${func}_avx2
2349 .rva .Lprologue,.Lepilogue # HandlerData[]
2351 $code.=<<___ if ($SZ==4 && $shaext);
2352 .LSEH_info_${func}_shaext:
2356 $code.=<<___ if ($SZ==4);
2357 .LSEH_info_${func}_ssse3:
2360 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
2362 $code.=<<___ if ($avx && $SZ==8);
2363 .LSEH_info_${func}_xop:
2366 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
2368 $code.=<<___ if ($avx);
2369 .LSEH_info_${func}_avx:
2372 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
2374 $code.=<<___ if ($avx>1);
2375 .LSEH_info_${func}_avx2:
2378 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
2385 "sha256rnds2" => 0xcb,
2386 "sha256msg1" => 0xcc,
2387 "sha256msg2" => 0xcd );
2389 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
2390 my @opcode=(0x0f,0x38);
2391 push @opcode,$opcodelet{$instr};
2392 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
2393 return ".byte\t".join(',',@opcode);
2395 return $instr."\t".@_[0];
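	# e.g. "sha256rnds2 %xmm1,%xmm2" (AT&T operand order) comes out as
	# .byte 0x0f,0x38,0xcb,0xd1 - ModR/M 0xd1 = 11|010|001, i.e.
	# reg=destination=2, r/m=source=1 (assumed worked example).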
2399 foreach (split("\n",$code)) {
2400 s/\`([^\`]*)\`/eval $1/geo;
2402 s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;