# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
# sha256/512_block procedure for x86_64.
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 - >40%. No magical
# tricks, just straight implementation... I really wonder why gcc
# [being armed with inline assembler] fails to generate as fast code.
# The only thing which is cool about this module is that it's the very
# same instruction sequence used for both SHA-256 and SHA-512. In the
# former case the instructions operate on 32-bit operands, while in the
# latter - on 64-bit ones. All I had to do was get one flavor right,
# the other one passed the test right away:-)
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank[!], tends to 4 instructions per CPU clock
# cycle and runs in 1003 cycles, 1275 is a very good result for the
# 3-way issue Opteron pipeline with X[16] maintained in memory. So *if*
# there is a way to improve it, *then* the only way would be to try to
# offload X[16] updates to the SSE unit, but that would require "deeper"
# loop unroll, which in turn would naturally cause size blow-up, not
# to mention increased complexity! And once again, only *if* it's
# actually possible to noticeably improve overall ILP, instruction
# level parallelism, on a given CPU implementation in this case.
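#
# The arithmetic above in runnable form (an illustrative sketch, not
# part of the generator; 64 and 128 are the block sizes in bytes):
#
#	perl -e 'printf "%.1f MBps/GHz\n", 64*1000/1005'	# SHA-256: 63.7
#	perl -e 'printf "%.1f MBps/GHz\n", 128*1000/1275'	# SHA-512: 100.4
#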
# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
# [currently available] EM64T CPUs apparently are far from it. On the
# contrary, the 64-bit version, sha512_block, is ~30% *slower* than the
# 32-bit sha256_block:-( This is presumably because 64-bit shifts/rotates
# are not atomic instructions, but are implemented in microcode.
# Optimization including one of Pavel Semjanov's ideas, an alternative
# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
# unfortunately -2% SHA512 on P4 [which nobody should care about
# anymore].
# Add SIMD code paths, see below for improvement coefficients. An SSSE3
# code path was not attempted for SHA512, because the improvement is not
# estimated to be high enough, noticeably less than 9%, to justify
# the effort, not on pre-AVX processors. [Obviously with the exclusion
# of VIA Nano, but it has a SHA512 instruction that is faster and
# should be used instead.] For reference, the corresponding estimated
# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
# higher coefficients are observed on VIA Nano and Bulldozer has more
# to do with specifics of their architecture [which is a topic for
# separate discussion].
# Add AVX2 code path. Two consecutive input blocks are loaded to
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant. The data is then processed with the same SIMD instruction
# sequence as for AVX, but with %ymm as operands. The side effect is an
# increased stack frame, 448 additional bytes in SHA256 and 1152 in
# SHA512, and 1.2KB larger code.
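#
# To illustrate the two-block packing (a sketch, not emitted code): the
# AVX2 loader below builds each %ymm register with vinserti128 roughly as
#
#	%ymm0 = { W0..W3 of block 0 | W0..W3 of block 1 }
#
# i.e. block 0 in the low 128-bit half and block 1 in the high half, so
# a single vpshufb/vpaddd step advances the schedule of both blocks.
#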
# Add support for Intel SHA Extensions.
######################################################################
# Current performance in cycles per processed byte (less is better):
#
#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
#
# AMD K8	14.9	-	    -		    9.57    -
# Core 2	15.6	13.8(+13%)  -		    9.97    -
# Westmere	14.8	12.3(+19%)  -		    9.58    -
# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
# Skylake	11.4	9.03(+26%)  7.70(+48%)	    7.25    5.20(+40%)
# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# Ryzen		11.0	9.02(+22%)  2.05(+440%)	    7.05    5.67(+20%)
# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
# Atom		23.0	18.9(+22%)  -		    14.7    -
# Silvermont	27.4	20.6(+33%)  -		    17.5    -
# Knights L	27.4	21.0(+30%)  19.6(+40%)	    17.5    12.8(+37%)
# Goldmont	18.9	14.3(+32%)  4.16(+350%)	    12.0    -
#
# (*)	whichever best applicable, including SHAEXT;
# (**)	switch from ror to shrd stands for fair share of improvement;
# (***)	execution time is fully determined by remaining integer-only
#	part, body_00_15; reducing the amount of SIMD instructions
#	below certain limit makes no difference/sense; to conserve
#	space SHA256 XOP code path is therefore omitted;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}
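# Resulting ladder (a summary note, not from the original source): $avx
# ends up 0 (no AVX), 1 (AVX-capable assembler, e.g. gas 2.19+, nasm 2.09+,
# ml64 v10+, llvm 3.0) or 2 (AVX2-capable, e.g. gas 2.22+, nasm 2.10+,
# ml64 v11+, llvm >3.0).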
$shaext=1;	### set to zero if compiling for 1.0.1
$avx=1 if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";

if ($output =~ /512/) {
	$func="sha512_block_data_order";
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
					"%r8", "%r9", "%r10","%r11");
	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
} else {
	$func="sha256_block_data_order";
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
					"%r8d","%r9d","%r10d","%r11d");
	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
}

$ctx="%rdi";	# 1st arg, zapped by $a3
$inp="%rsi";	# 2nd arg

$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_rsp="`16*$SZ+3*8`(%rsp)";
$framesz="16*$SZ+4*8";

sub ROUND_00_15()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
	mov	$T1,`$SZ*($i&0xf)`(%rsp)
	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
	add	$a2,$T1			# T1+=Ch(e,f,g)
	add	($Tbl),$T1		# T1+=K[round]
	xor	$b,$a2			# a^b, b^c in next round
	ror	\$$Sigma1[0],$a0	# Sigma1(e)
	ror	\$$Sigma0[0],$a1	# Sigma0(a)
	add	$a0,$T1			# T1+=Sigma1(e)
	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	lea	$STRIDE($Tbl),$Tbl	# round++
$code.=<<___ if ($i<15);
	add	$a1,$h			# h+=Sigma0(a)
($a2,$a3) = ($a3,$a2);
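# A scalar reference model of one round (an illustrative sketch, not used
# by the generator), including the "alternative Maj" trick noted above:
# Maj(a,b,c) is computed as Ch(a^b,c,b) = ((a^b)&(b^c))^b, which lets two
# consecutive rounds share the b^c term. SHA-256 rotation counts are
# shown; SHA-512 uses (14,18,41)/(28,34,39) on 64-bit words.
sub sha256_round_model {
    my ($W, $K, @S) = @_;		# schedule word, round constant, state a..h
    my ($a,$b,$c,$d,$e,$f,$g,$h) = @S;
    my $rotr = sub { my ($x,$n) = @_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff };
    my $S1  = $rotr->($e,6) ^ $rotr->($e,11) ^ $rotr->($e,25);	# Sigma1(e)
    my $S0  = $rotr->($a,2) ^ $rotr->($a,13) ^ $rotr->($a,22);	# Sigma0(a)
    my $ch  = (($f^$g)&$e)^$g;			# same form as the comment above
    my $maj = ((($a^$b)&($b^$c))^$b);		# Maj(a,b,c) as Ch(a^b,c,b)
    my $T1  = ($h + $S1 + $ch + $K + $W) & 0xffffffff;
    my $T2  = ($S0 + $maj) & 0xffffffff;
    return (($T1+$T2)&0xffffffff, $a, $b, $c, ($d+$T1)&0xffffffff, $e, $f, $g);
}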
sub ROUND_16_XX()
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
	add	`$SZ*($i&0xf)`(%rsp),$T1
.extern	OPENSSL_ia32cap_P
.type	$func,\@function,3
$code.=<<___ if ($SZ==4 || $avx);
	lea	OPENSSL_ia32cap_P(%rip),%r11
$code.=<<___ if ($SZ==4 && $shaext);
	test	\$`1<<29`,%r11d		# check for SHA
$code.=<<___ if ($avx && $SZ==8);
	test	\$`1<<11`,%r10d		# check for XOP
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
$code.=<<___ if ($avx);
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
	cmp	\$`1<<28|1<<9|1<<30`,%r10d
$code.=<<___ if ($SZ==4);
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	shl	\$4,%rdx		# num*16
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
	lea	$TABLE(%rip),$Tbl

for($i=0;$i<16;$i++) {
	$code.="	mov	$SZ*$i($inp),$T1\n";
	$code.="	mov	@ROT[4],$a0\n";
	$code.="	mov	@ROT[0],$a1\n";
	$code.="	bswap	$T1\n";
	&ROUND_00_15($i,@ROT);
	unshift(@ROT,pop(@ROT));
	&ROUND_16_XX($i,@ROT);
	unshift(@ROT,pop(@ROT));
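# Reference model of the schedule update done by ROUND_16_XX (a sketch,
# not used by the generator): with a 16-word circular window in %rsp,
#	X[i&15] += sigma1(X[(i+14)&15]) + X[(i+9)&15] + sigma0(X[(i+1)&15])
# which is the usual W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
# rewritten for in-place indices. SHA-256 sigma counts shown.
sub schedule_model {
    my ($i, $X) = @_;			# $X is a ref to the 16-word window
    my $rotr = sub { my ($x,$n) = @_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff };
    my $w15 = $X->[($i+1)&15];		# W[i-15]
    my $w2  = $X->[($i+14)&15];		# W[i-2]
    my $s0 = $rotr->($w15,7)  ^ $rotr->($w15,18) ^ ($w15>>3);
    my $s1 = $rotr->($w2,17)  ^ $rotr->($w2,19)  ^ ($w2>>10);
    $X->[$i&15] = ($X->[$i&15] + $s1 + $X->[($i+9)&15] + $s0) & 0xffffffff;
}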
	cmpb	\$0,`$SZ-1`($Tbl)
	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
	lea	16*$SZ($inp),$inp
.cfi_def_cfa_register	%rsp
.type	$TABLE,\@object
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.type	$TABLE,\@object
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"

######################################################################

if ($SZ==4 && $shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA256 update function.
my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
my @MSG=map("%xmm$_",(3..6));
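# The shuffles in the prologue below convert the context's in-memory
# {A,B,C,D}/{E,F,G,H} word order into the ABEF/CDGH register pairs that
# sha256rnds2 expects. A lane model of pshufd (an illustrative sketch,
# not used by the generator): destination dword $_ takes source dword
# ($imm>>2*$_)&3, so 0x1b reverses the four lanes and 0xb1 swaps them
# pairwise, as the per-line comments below indicate.
sub pshufd_model {
    my ($imm, @src) = @_;		# @src = (lane0..lane3), lane0 least significant
    return map { $src[($imm >> (2*$_)) & 3] } 0..3;
}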
.type	sha256_block_data_order_shaext,\@function,3
sha256_block_data_order_shaext:
$code.=<<___ if ($win64);
	lea	`-8-5*16`(%rsp),%rsp
	movaps	%xmm6,-8-5*16(%rax)
	movaps	%xmm7,-8-4*16(%rax)
	movaps	%xmm8,-8-3*16(%rax)
	movaps	%xmm9,-8-2*16(%rax)
	movaps	%xmm10,-8-1*16(%rax)
	lea	K256+0x80(%rip),$Tbl
	movdqu	($ctx),$ABEF		# DCBA
	movdqu	16($ctx),$CDGH		# HGFE
	movdqa	0x200-0x80($Tbl),$TMP	# byte swap mask
	pshufd	\$0x1b,$ABEF,$Wi	# ABCD
	pshufd	\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd	\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa	$TMP,$BSWAP		# offload
	palignr	\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH	# CDGH
	movdqu	($inp),@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqu	0x20($inp),@MSG[2]
	movdqu	0x30($inp),@MSG[3]
	movdqa	0*32-0x80($Tbl),$Wi
	movdqa	$CDGH,$CDGH_SAVE	# offload
	sha256rnds2	$ABEF,$CDGH	# 0-3
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	$ABEF,$ABEF_SAVE	# offload
	sha256rnds2	$CDGH,$ABEF
	movdqa	1*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 4-7
	pshufd	\$0x0e,$Wi,$Wi
	sha256msg1	@MSG[1],@MSG[0]
	sha256rnds2	$CDGH,$ABEF
	movdqa	2*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 8-11
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[2],$TMP
	sha256msg1	@MSG[2],@MSG[1]
	sha256rnds2	$CDGH,$ABEF
	movdqa	3*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256rnds2	$ABEF,$CDGH	# 12-15
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[3],$TMP
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$CDGH,$ABEF
for($i=4;$i<16-3;$i++) {
	movdqa	$i*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 16-19...
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[0],$TMP
	sha256msg1	@MSG[0],@MSG[3]
	sha256rnds2	$CDGH,$ABEF
push(@MSG,shift(@MSG));
	movdqa	13*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 52-55
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[0],$TMP
	sha256rnds2	$CDGH,$ABEF
	movdqa	14*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 56-59
	pshufd	\$0x0e,$Wi,$Wi
	sha256msg2	@MSG[1],@MSG[2]
	sha256rnds2	$CDGH,$ABEF
	movdqa	15*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 60-63
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	paddd	$CDGH_SAVE,$CDGH
	paddd	$ABEF_SAVE,$ABEF
	pshufd	\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd	\$0x1b,$ABEF,$TMP	# FEBA
	pshufd	\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF	# DCBA
	palignr	\$8,$TMP,$CDGH		# HGFE
	movdqu	$CDGH,16($ctx)
$code.=<<___ if ($win64);
	movaps	-8-5*16(%rax),%xmm6
	movaps	-8-4*16(%rax),%xmm7
	movaps	-8-3*16(%rax),%xmm8
	movaps	-8-2*16(%rax),%xmm9
	movaps	-8-1*16(%rax),%xmm10
.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
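# e.g. &ror($a0,14) pops the numeric argument, prefixes it with "\$" and
# appends "\tror\t\$14,%r13d" to $code - AT&T operand order with the
# immediate first.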
sub body_00_15 () {
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g
	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&and	($a4,$e)',			# (f^g)&e
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($d,$h)',			# d+=h
	'&add	($h,$a3)',			# h+=Maj(a,b,c)
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'

######################################################################

if ($SZ==4) {	# SHA256 only
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
.type	${func}_ssse3,\@function,3
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*4`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	movdqu	0x00($inp),@X[0]
	movdqu	0x10($inp),@X[1]
	movdqu	0x20($inp),@X[2]
	movdqu	0x30($inp),@X[3]
	lea	$TABLE(%rip),$Tbl
	movdqa	0x00($Tbl),$t0
	movdqa	0x20($Tbl),$t1
	movdqa	0x40($Tbl),$t2
	movdqa	0x60($Tbl),$t3
	movdqa	$t0,0x00(%rsp)
	movdqa	$t1,0x10(%rsp)
	movdqa	$t2,0x20(%rsp)
	movdqa	$t3,0x30(%rsp)
	sub	\$`-16*2*$SZ`,$Tbl	# size optimization

sub Xupdate_256_SSSE3 () {
	'&movdqa	($t0,@X[1]);',
	'&movdqa	($t3,@X[3])',
	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
	'&palignr	($t3,@X[2],$SZ);',	# X[9..12]
	'&movdqa	($t2,$t0);',
	'&psrld	($t0,$sigma0[2])',
	'&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
	'&psrld	($t2,$sigma0[0])',
	'&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&pslld	($t1,8*$SZ-$sigma0[1]);'.
	'&psrld	($t2,$sigma0[1]-$sigma0[0]);'.
	'&pslld	($t1,$sigma0[1]-$sigma0[0]);'.
	'&pxor	($t0,$t1);',		# sigma0(X[1..4])
	'&psrld	($t3,$sigma1[2])',
	'&paddd	(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
	'&psrlq	($t2,$sigma1[0])',
	'&psrlq	($t2,$sigma1[1]-$sigma1[0])',
	'&pshufb	($t3,$t4)',		# sigma1(X[14..15])
	'&paddd	(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
	'&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&movdqa	($t2,$t3);',
	'&psrld	($t3,$sigma1[2])',
	'&psrlq	($t2,$sigma1[0])',
	'&psrlq	($t2,$sigma1[1]-$sigma1[0])',
	'&movdqa	($t2,16*2*$j."($Tbl)")',
	'&paddd	(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
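# SSE has no packed-rotate instruction, so every ROTR above is synthesized
# from a shift pair (psrld/pslld or psrlq) XORed together. The scalar
# identity being exploited, as a sketch not used by the generator:
sub rotr_model { my ($x,$n) = @_; (($x >> $n) | ($x << (32-$n))) & 0xffffffff }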
sub SSSE3_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
    foreach (Xupdate_256_SSSE3()) {		# 36 instructions
} else {			# squeeze extra 4% on Westmere and 19% on Atom
	eval(shift(@insns));	#@
	eval(shift(@insns));	#@
	eval(shift(@insns));	#@
	 &palignr	($t0,@X[0],$SZ);	# X[1..4]
	 &palignr	($t3,@X[2],$SZ);	# X[9..12]
	eval(shift(@insns));	#@
	eval(shift(@insns));	#@
	 &psrld	($t0,$sigma0[2]);
	 &paddd	(@X[0],$t3);		# X[0..3] += X[9..12]
	eval(shift(@insns));	#@
	 &psrld	($t2,$sigma0[0]);
	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
	eval(shift(@insns));	#@
	 &pslld	($t1,8*$SZ-$sigma0[1]);
	eval(shift(@insns));	#@
	eval(shift(@insns));	#@
	 &psrld	($t2,$sigma0[1]-$sigma0[0]);
	 &pslld	($t1,$sigma0[1]-$sigma0[0]);
	eval(shift(@insns));	#@
	 &pxor	($t0,$t1);		# sigma0(X[1..4])
	eval(shift(@insns));	#@
	 &psrld	($t3,$sigma1[2]);
	 &paddd	(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
	eval(shift(@insns));	#@
	 &psrlq	($t2,$sigma1[0]);
	eval(shift(@insns));	#@
	eval(shift(@insns));	#@
	 &psrlq	($t2,$sigma1[1]-$sigma1[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));	#@
	eval(shift(@insns));
	eval(shift(@insns));
	 #&pshufb	($t3,$t4);		# sigma1(X[14..15])
	 &pshufd	($t3,$t3,0b10000000);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));	#@
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));	#@
	 &paddd	(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
	eval(shift(@insns));
	eval(shift(@insns));	#@
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	 &psrld	($t3,$sigma1[2]);
	eval(shift(@insns));
	eval(shift(@insns));	#@
	 &psrlq	($t2,$sigma1[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));	#@
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));	#@
	eval(shift(@insns));
	 &psrlq	($t2,$sigma1[1]-$sigma1[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));	#@
	 &pshufd	($t3,$t3,0b00001000);
	eval(shift(@insns));
	eval(shift(@insns));
	 &movdqa	($t2,16*2*$j."($Tbl)");
	eval(shift(@insns));	#@
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	 &paddd	(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
	eval(shift(@insns));	#@
	eval(shift(@insns));
	eval(shift(@insns));
    foreach (@insns) { eval; }		# remaining instructions
    &movdqa	(16*$j."(%rsp)",$t2);

for ($i=0,$j=0; $j<4; $j++) {
	&SSSE3_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lssse3_00_47");

for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
	lea	16*$SZ($inp),$inp
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
.cfi_def_cfa_register	%rsp
.size	${func}_ssse3,.-${func}_ssse3

######################################################################

if ($SZ==8) {	# SHA512 only
.type	${func}_xop,\@function,3
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)

if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
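# Unlike the SSSE3 path, XOP has a genuine packed rotate, vprotd/vprotq
# (rotate *left* by the immediate), so each ROTR^n below is a single
# vprot by 8*$SZ-n instead of a shift/shift/xor triplet.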
sub XOP_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	eval(shift(@insns));
	eval(shift(@insns));
	&vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	eval(shift(@insns));
	eval(shift(@insns));
	&vprotd	($t1,$t0,8*$SZ-$sigma0[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vpsrld	($t0,$t0,$sigma0[2]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vprotd	($t2,$t1,$sigma0[1]-$sigma0[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vpxor	($t0,$t0,$t1);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vpxor	($t0,$t0,$t2);		# sigma0(X[1..4])
	eval(shift(@insns));
	eval(shift(@insns));
	&vpsrld	($t2,@X[3],$sigma1[2]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vpaddd	(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	eval(shift(@insns));
	eval(shift(@insns));
	&vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vpxor	($t3,$t3,$t2);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpxor	($t3,$t3,$t1);		# sigma1(X[14..15])
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpsrldq	($t3,$t3,8);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpaddd	(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vpsrld	($t2,@X[0],$sigma1[2]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vpxor	($t3,$t3,$t2);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpxor	($t3,$t3,$t1);		# sigma1(X[16..17])
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpslldq	($t3,$t3,8);		# 22 instructions
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpaddd	(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }	# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);

for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lxop_00_47");

for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }

my @X = map("%xmm$_",(0..7));
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vpshufb	$t3,@X[0],@X[0]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[1],@X[1]
	vmovdqu	0x40($inp),@X[4]
	vpshufb	$t3,@X[2],@X[2]
	vmovdqu	0x50($inp),@X[5]
	vpshufb	$t3,@X[3],@X[3]
	vmovdqu	0x60($inp),@X[6]
	vpshufb	$t3,@X[4],@X[4]
	vmovdqu	0x70($inp),@X[7]
	vpshufb	$t3,@X[5],@X[5]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t3,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t3,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x10(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x20(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x30(%rsp)
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x40(%rsp)
	vmovdqa	$t1,0x50(%rsp)
	vmovdqa	$t2,0x60(%rsp)
	vmovdqa	$t3,0x70(%rsp)
	add	\$`16*2*$SZ`,$Tbl

sub XOP_512_00_47 () {
my @insns = (&$body,&$body);		# 52 instructions
	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..2]
	eval(shift(@insns));
	eval(shift(@insns));
	&vpalignr	($t3,@X[5],@X[4],$SZ);	# X[9..10]
	eval(shift(@insns));
	eval(shift(@insns));
	&vprotq	($t1,$t0,8*$SZ-$sigma0[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vpsrlq	($t0,$t0,$sigma0[2]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vpaddq	(@X[0],@X[0],$t3);	# X[0..1] += X[9..10]
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vprotq	($t2,$t1,$sigma0[1]-$sigma0[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vpxor	($t0,$t0,$t1);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vprotq	($t3,@X[7],8*$SZ-$sigma1[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vpxor	($t0,$t0,$t2);		# sigma0(X[1..2])
	eval(shift(@insns));
	eval(shift(@insns));
	&vpsrlq	($t2,@X[7],$sigma1[2]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vpaddq	(@X[0],@X[0],$t0);	# X[0..1] += sigma0(X[1..2])
	eval(shift(@insns));
	eval(shift(@insns));
	&vprotq	($t1,$t3,$sigma1[1]-$sigma1[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vpxor	($t3,$t3,$t2);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpxor	($t3,$t3,$t1);		# sigma1(X[14..15])
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpaddq	(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpaddq	($t2,@X[0],16*2*$j-0x80."($Tbl)");
	foreach (@insns) { eval; }	# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);

for ($i=0,$j=0; $j<8; $j++) {
	&XOP_512_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)
	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
	&jne	(".Lxop_00_47");

for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }

	lea	16*$SZ($inp),$inp
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
.cfi_def_cfa_register	%rsp
.size	${func}_xop,.-${func}_xop

######################################################################
# AVX+shrd code path

local *ror = sub { &shrd(@_[0],@_) };
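# With *ror aliased like this, every '&ror($reg,$n)' in body_00_15 now
# emits "shrd \$$n,$reg,$reg" - the same result as a rotate right when
# both operands are one register, but a fair bit faster on Sandy Bridge,
# per footnote (**) in the performance table above.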
.type	${func}_avx,\@function,3
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)

if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$`-16*2*$SZ`,$Tbl	# size optimization

sub Xupdate_256_AVX () {
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor	($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor	($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor	($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor	($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor	($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor	($t2,$t2,$t3)',
	'&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor	($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor	($t2,$t2,$t3)',
	'&vpshufb	($t2,$t2,$t5)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])

sub AVX_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }	# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);

for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");

for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }

my @X = map("%xmm$_",(0..7));
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vpshufb	$t3,@X[0],@X[0]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[1],@X[1]
	vmovdqu	0x40($inp),@X[4]
	vpshufb	$t3,@X[2],@X[2]
	vmovdqu	0x50($inp),@X[5]
	vpshufb	$t3,@X[3],@X[3]
	vmovdqu	0x60($inp),@X[6]
	vpshufb	$t3,@X[4],@X[4]
	vmovdqu	0x70($inp),@X[7]
	vpshufb	$t3,@X[5],@X[5]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t3,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t3,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x10(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x20(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x30(%rsp)
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x40(%rsp)
	vmovdqa	$t1,0x50(%rsp)
	vmovdqa	$t2,0x60(%rsp)
	vmovdqa	$t3,0x70(%rsp)
	add	\$`16*2*$SZ`,$Tbl

sub Xupdate_512_AVX () {
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
	'&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
	'&vpsrlq	($t2,$t0,$sigma0[0])',
	'&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
	'&vpsrlq	($t3,$t0,$sigma0[2])',
	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor	($t0,$t3,$t2)',
	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor	($t0,$t0,$t1)',
	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor	($t0,$t0,$t2)',
	'&vpsrlq	($t3,@X[7],$sigma1[2]);',
	'&vpxor	($t0,$t0,$t1)',		# sigma0(X[1..2])
	'&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
	'&vpsrlq	($t1,@X[7],$sigma1[0]);',
	'&vpxor	($t3,$t3,$t2)',
	'&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
	'&vpxor	($t3,$t3,$t1)',
	'&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
	'&vpxor	($t3,$t3,$t2)',
	'&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])

sub AVX_512_00_47 () {
my @insns = (&$body,&$body);		# 52 instructions
	foreach (Xupdate_512_AVX()) {	# 23 instructions
	    eval(shift(@insns));
	    eval(shift(@insns));
	&vpaddq	($t2,@X[0],16*2*$j-0x80."($Tbl)");
	foreach (@insns) { eval; }	# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);

for ($i=0,$j=0; $j<8; $j++) {
	&AVX_512_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)
	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
	&jne	(".Lavx_00_47");

for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }

	lea	16*$SZ($inp),$inp
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
.cfi_def_cfa_register	%rsp
.size	${func}_avx,.-${func}_avx

######################################################################
# AVX2+BMI code path

my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp

sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 should be $b^$c and $a4 a copy of $f
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',		# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',
	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',		# ~e&g
	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',		# Sigma1(e)
	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
	'&xor	($a2,$b)',		# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',
	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',	# d+=h
	'&and	($a3,$a2)',		# (b^c)&(a^b)
	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',		# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov	($a4,$e)',		# copy of f in future
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	# and at the finish one has to $a+=$a1
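# The BMI path computes Ch via andn as (e&f)+(~e&g); the two terms have
# no set bits in common, so '+' (which fuses into the lea above) equals
# the usual OR. A quick check, an illustration rather than part of the
# generator:
#
#	perl -e '($e,$f,$g)=(0xf0,0xcc,0xaa);
#	         printf "%x %x\n", ($e&$f)+(~$e&$g&0xff), ((($f^$g)&$e)^$g)'
#
# prints "ca ca".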
.type	${func}_avx2,\@function,3
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
	shl	\$4,%rdx		# num*16
	and	\$-256*$SZ,%rsp		# align stack frame
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	add	\$`2*$SZ*($rounds-8)`,%rsp
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)
	sub	\$-16*$SZ,$inp		# inp++, size optimization
	mov	$inp,%r12		# borrow $T1
	cmp	%rdx,$inp		# $_end
	cmove	%rsp,%r12		# next block or random data

if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	vmovdqu	-16*$SZ+48($inp),%xmm3
	#mov	$inp,$_inp		# offload $inp
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vmovdqa	$t2,0x00(%rsp)
	vmovdqa	$t3,0x20(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization

sub AVX2_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";
	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }	# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);

for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }

my @X = map("%ymm$_",(0..7));
my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
	vmovdqu	-16*$SZ($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	-16*$SZ+48($inp),%xmm3
	vmovdqu	-16*$SZ+64($inp),%xmm4
	vmovdqu	-16*$SZ+80($inp),%xmm5
	vmovdqu	-16*$SZ+96($inp),%xmm6
	vmovdqu	-16*$SZ+112($inp),%xmm7
	#mov	$inp,$_inp		# offload $inp
	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t2,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t2,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	vpshufb	$t2,@X[2],@X[2]
	vinserti128	\$1,64(%r12),@X[4],@X[4]
	vpshufb	$t2,@X[3],@X[3]
	vinserti128	\$1,80(%r12),@X[5],@X[5]
	vpshufb	$t2,@X[4],@X[4]
	vinserti128	\$1,96(%r12),@X[6],@X[6]
	vpshufb	$t2,@X[5],@X[5]
	vinserti128	\$1,112(%r12),@X[7],@X[7]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t2,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t2,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x20(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x40(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x60(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x20(%rsp)
	vmovdqa	$t2,0x40(%rsp)
	vmovdqa	$t3,0x60(%rsp)

sub AVX2_512_00_47 () {
my @insns = (&$body,&$body);		# 48 instructions
my $base = "+2*$PUSH8(%rsp)";
	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	&vpaddq	($t2,@X[0],16*2*$j-0x80."($Tbl)");
	foreach (@insns) { eval; }	# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);

for ($i=0,$j=0; $j<8; $j++) {
	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }

	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end

for ($i=0; $i<8; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
	lea	-$PUSH8($Tbl),$Tbl
	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
	lea	`2*16*$SZ`($inp),$inp	# inp+=2
	cmove	%rsp,%r12		# next block or stale data
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
.cfi_def_cfa_register	%rsp
.size	${func}_avx2,.-${func}_avx2

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip
	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	mov	152($context),%rax	# pull context->Rsp
	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	add	\$`2*$SZ*($rounds-8)`,%rax
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
	lea	.Lepilogue(%rip),%r10
	jb	.Lin_prologue		# non-AVX code
	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$`$SZ==4?8:12`,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi
	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)
	mov	\$1,%eax		# ExceptionContinueSearch
.size	se_handler,.-se_handler

$code.=<<___ if ($SZ==4 && $shaext);
.type	shaext_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip
	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	lea	-8-5*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq
.size	shaext_handler,.-shaext_handler

	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func
$code.=<<___ if ($SZ==4 && $shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
$code.=<<___ if ($SZ==4);
	.rva	.LSEH_begin_${func}_ssse3
	.rva	.LSEH_end_${func}_ssse3
	.rva	.LSEH_info_${func}_ssse3
$code.=<<___ if ($avx && $SZ==8);
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
	.rva	.Lprologue,.Lepilogue			# HandlerData[]
$code.=<<___ if ($SZ==4 && $shaext);
.LSEH_info_${func}_shaext:
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
$code.=<<___ if ($avx && $SZ==8);
.LSEH_info_${func}_xop:
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
2477 "sha256rnds2" => 0xcb,
2478 "sha256msg1" => 0xcc,
2479 "sha256msg2" => 0xcd );
2481 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
2482 my @opcode=(0x0f,0x38);
2483 push @opcode,$opcodelet{$instr};
2484 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
2485 return ".byte\t".join(',',@opcode);
2487 return $instr."\t".@_[0];
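# e.g. sha256op38("sha256rnds2","%xmm0,%xmm1") returns
# ".byte\t15,56,203,200" (0x0f,0x38,0xcb,0xc8), hand-encoding the
# instruction for assemblers that predate the SHA extension mnemonics.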
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;