# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 >40% faster. No
# magical tricks, just a straight implementation... I really wonder
# why gcc [even armed with inline assembler] fails to generate code
# this fast. The only thing that is cool about this module is that
# the very same instruction sequence is used for both SHA-256 and
# SHA-512. In the former case the instructions operate on 32-bit
# operands, in the latter on 64-bit ones. All I had to do was get one
# flavor right, and the other one passed the test right away:-)
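#
# For reference, the two flavors differ only in word size and in the
# rotate/shift amounts fed to the Sigma/sigma functions; the constants
# used further down are:
#
#			SHA-256 ($SZ==4)	SHA-512 ($SZ==8)
#	Sigma0		( 2,13,22)		(28,34,39)
#	Sigma1		( 6,11,25)		(14,18,41)
#	sigma0		( 7,18, 3)		( 1, 8, 7)
#	sigma1		(17,19,10)		(19,61, 6)
#
# where the last element of sigma0/sigma1 is a plain shift.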
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank[!], tends to 4 instructions per CPU clock
# cycle and runs in 1003 cycles, then 1275 is a very good result for
# the 3-way issue Opteron pipeline with X[16] maintained in memory.
# So *if* there is a way to improve it, *then* the only way would be
# to try to offload the X[16] updates to the SSE unit, but that would
# require "deeper" loop unrolling, which in turn would naturally cause
# size blow-up, not to mention increased complexity! And once again,
# only *if* it's actually possible to noticeably improve overall
# instruction-level parallelism [ILP] on a given CPU implementation.
# Special note on Intel EM64T. While Opteron CPUs exhibit a perfect
# performance ratio of 1.5 between the 64- and 32-bit flavors [see
# above], [currently available] EM64T CPUs are apparently far from it.
# On the contrary, the 64-bit version, sha512_block, is ~30% *slower*
# than the 32-bit sha256_block:-( This is presumably because 64-bit
# shifts/rotates are not atomic instructions there, but are
# implemented in microcode.
# Optimization including one of Pavel Semjanov's ideas, the
# alternative Maj, resulted in >=5% improvement on most CPUs, +20% for
# SHA256 and unfortunately -2% for SHA512 on P4 [which nobody should
# care about anymore].
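#
# The alternative Maj mentioned above rests on the identity
# Maj(a,b,c) = Ch(a^b,c,b) = ((b^c)&(a^b))^b, which lets every round
# reuse the a^b value it computes anyway as the next round's b^c; a
# one-line Perl illustration (not part of the generated code):
#
#	sub Maj { my ($a,$b,$c)=@_; (($b^$c)&($a^$b))^$b }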
# Add SIMD code paths; see below for improvement coefficients. An
# SSSE3 code path was not attempted for SHA512, because the
# improvement is not estimated to be high enough, noticeably less
# than 9%, to justify the effort, at least not on pre-AVX processors.
# [Obviously with an exclusion for VIA Nano, but it has a SHA512
# instruction that is faster and should be used instead.] For
# reference, the corresponding estimated upper limit for SSSE3 SHA256
# improvement is 28%. The fact that higher coefficients are observed
# on VIA Nano and Bulldozer has more to do with specifics of their
# architecture [which is a topic for separate discussion].
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant. The data is then processed with the same SIMD
# instruction sequence as for AVX, but with %ymm as operands. The side
# effect is an increased stack frame, 448 additional bytes in SHA256
# and 1152 in SHA512, and a ~1.2KB code size increase.
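#
# Schematically, for two consecutive 64-byte blocks B0 and B1 (an
# illustration, not generated code):
#
#	%ymm = [ B1 words : B0 words ]	# high 128 bits : low 128 bits
#
# so each vinserti128 \$1,(%r12),... below drops the second block's
# quarter into the upper half, and one 256-bit instruction then runs
# the message schedule for both blocks at once.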
# Add support for Intel SHA Extensions.

######################################################################
# Current performance in cycles per processed byte (less is better):
#
#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
#
# AMD K8	14.9	-	    -		    9.57    -
# Core 2	15.6	13.8(+13%)  -		    9.97    -
# Westmere	14.8	12.3(+19%)  -		    9.58    -
# Sandy Bridge	17.4	14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge	12.6	10.5(+20%)  10.3(+22%)	    8.17    7.22(+13%)
# Haswell	12.2	9.28(+31%)  7.80(+56%)	    7.66    5.40(+42%)
# Skylake	11.4	9.03(+26%)  7.70(+48%)	    7.25    5.20(+40%)
# Bulldozer	21.1	13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# Ryzen		11.0	?	    2.05(+440%)	    7.05    5.67(+20%)
# VIA Nano	23.0	16.5(+39%)  -		    14.7    -
# Atom		23.0	18.9(+22%)  -		    14.7    -
# Silvermont	27.4	20.6(+33%)  -		    17.5    -
# Goldmont	18.9	14.3(+32%)  4.16(+350%)	    12.0    -
#
# (*)	whichever best applicable, including SHAEXT;
# (**)	switch from ror to shrd stands for fair share of improvement;
# (***)	execution time is fully determined by remaining integer-only
#	part, body_00_15; reducing the amount of SIMD instructions
#	below certain limit makes no difference/sense; to conserve
#	space SHA256 XOP code path is therefore omitted;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}
$shaext=1;	### set to zero if compiling for 1.0.1
$avx=1 if (!$shaext && $avx);
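
# $avx effectively selects the code path: 0 - integer-only and SSSE3,
# 1 - additionally AVX/XOP, 2 - additionally AVX2 (gated by the
# "$avx>1" tests below). The assembler probes above exist because the
# SIMD paths use instructions that older toolchains cannot assemble.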
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
if ($output =~ /512/) {
	$func="sha512_block_data_order";
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
					"%r8", "%r9", "%r10","%r11");
	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
} else {
	$func="sha256_block_data_order";
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
					"%r8d","%r9d","%r10d","%r11d");
	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
}
$ctx="%rdi";	# 1st arg, zapped by $a3
$inp="%rsi";	# 2nd arg

$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_rsp="`16*$SZ+3*8`(%rsp)";
$framesz="16*$SZ+4*8";
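
# Resulting stack frame layout, SHA-256 shown ($SZ==4; SHA-512 is the
# same shape with $SZ==8):
#
#	0(%rsp)		16*$SZ bytes	X[0..15] circular buffer
#	16*$SZ+0*8	saved $ctx	(1st arg, %rdi gets reused)
#	16*$SZ+1*8	saved $inp	(2nd arg)
#	16*$SZ+2*8	end-of-input pointer
#	16*$SZ+3*8	caller's %rsp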
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

  $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));

	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
	mov	$T1,`$SZ*($i&0xf)`(%rsp)
	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
	add	$a2,$T1			# T1+=Ch(e,f,g)
	add	($Tbl),$T1		# T1+=K[round]
	xor	$b,$a2			# a^b, b^c in next round
	ror	\$$Sigma1[0],$a0	# Sigma1(e)
	ror	\$$Sigma0[0],$a1	# Sigma0(a)
	add	$a0,$T1			# T1+=Sigma1(e)
	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	lea	$STRIDE($Tbl),$Tbl	# round++

$code.=<<___ if ($i<15);
	add	$a1,$h			# h+=Sigma0(a)

	($a2,$a3) = ($a3,$a2);
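# For orientation, the above is the standard SHA-2 round; an
# unscheduled Perl rendition of the 32-bit flavor, assuming a ror32()
# helper (a sketch, not what this module emits):
#
#	sub ror32 { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff }
#	# T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	# T2 = Sigma0(a) + Maj(a,b,c)
#	# (a,b,c,d,e,f,g,h) = (T1+T2, a, b, c, d+T1, e, f, g)
#
# whereas the code above modulo-schedules h+=Sigma0(a) into the next
# round and keeps a^b/b^c alive across rounds to shorten the critical
# path.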
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2

	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
	ror	\$`$sigma1[1]-$sigma1[0]`,$a2

	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1

	add	`$SZ*($i&0xf)`(%rsp),$T1
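	# The loads/adds above implement the schedule recurrence
	#	X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
	# with indices reduced mod 16, hence the 16-entry circular
	# buffer on the stack: (i+14)&0xf = i-2, (i+9)&0xf = i-7,
	# (i+1)&0xf = i-15 and i&0xf = i-16.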
.extern	OPENSSL_ia32cap_P

.type	$func,\@function,3

$code.=<<___ if ($SZ==4 || $avx);
	lea	OPENSSL_ia32cap_P(%rip),%r11

$code.=<<___ if ($SZ==4 && $shaext);
	test	\$`1<<29`,%r11d		# check for SHA

$code.=<<___ if ($avx && $SZ==8);
	test	\$`1<<11`,%r10d		# check for XOP

$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d

$code.=<<___ if ($avx);
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
	cmp	\$`1<<28|1<<9|1<<30`,%r10d

$code.=<<___ if ($SZ==4);

	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax

	shl	\$4,%rdx		# num*16
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8

	lea	$TABLE(%rip),$Tbl
for($i=0;$i<16;$i++) {
	$code.="	mov	$SZ*$i($inp),$T1\n";
	$code.="	mov	@ROT[4],$a0\n";
	$code.="	mov	@ROT[0],$a1\n";
	$code.="	bswap	$T1\n";
	&ROUND_00_15($i,@ROT);
	unshift(@ROT,pop(@ROT));

	&ROUND_16_XX($i,@ROT);
	unshift(@ROT,pop(@ROT));

	cmpb	\$0,`$SZ-1`($Tbl)

	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
	lea	16*$SZ($inp),$inp

.cfi_def_cfa_register	%rsp
.type	$TABLE,\@object

	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.type	$TABLE,\@object

	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817

	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
######################################################################

if ($SZ==4 && $shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA256 update function.
#
my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
my @MSG=map("%xmm$_",(3..6));
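
# sha256rnds2 expects the state as two packed halves, {ABEF} and
# {CDGH}, and takes the round constants (plus message words) from
# %xmm0 implicitly; the pshufd/palignr/punpcklqdq prologue below
# converts the {DCBA}/{HGFE} layout loaded from $ctx into that form,
# and the inverse shuffle at the end converts it back.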
.type	sha256_block_data_order_shaext,\@function,3

sha256_block_data_order_shaext:

$code.=<<___ if ($win64);
	lea	`-8-5*16`(%rsp),%rsp
	movaps	%xmm6,-8-5*16(%rax)
	movaps	%xmm7,-8-4*16(%rax)
	movaps	%xmm8,-8-3*16(%rax)
	movaps	%xmm9,-8-2*16(%rax)
	movaps	%xmm10,-8-1*16(%rax)

	lea	K256+0x80(%rip),$Tbl
	movdqu	($ctx),$ABEF		# DCBA
	movdqu	16($ctx),$CDGH		# HGFE
	movdqa	0x200-0x80($Tbl),$TMP	# byte swap mask

	pshufd	\$0x1b,$ABEF,$Wi	# ABCD
	pshufd	\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd	\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa	$TMP,$BSWAP		# offload
	palignr	\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH	# CDGH

	movdqu	($inp),@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqu	0x20($inp),@MSG[2]

	movdqu	0x30($inp),@MSG[3]

	movdqa	0*32-0x80($Tbl),$Wi

	movdqa	$CDGH,$CDGH_SAVE	# offload
	sha256rnds2	$ABEF,$CDGH	# 0-3
	pshufd	\$0x0e,$Wi,$Wi

	movdqa	$ABEF,$ABEF_SAVE	# offload
	sha256rnds2	$CDGH,$ABEF

	movdqa	1*32-0x80($Tbl),$Wi

	sha256rnds2	$ABEF,$CDGH	# 4-7
	pshufd	\$0x0e,$Wi,$Wi

	sha256msg1	@MSG[1],@MSG[0]
	sha256rnds2	$CDGH,$ABEF

	movdqa	2*32-0x80($Tbl),$Wi

	sha256rnds2	$ABEF,$CDGH	# 8-11
	pshufd	\$0x0e,$Wi,$Wi

	palignr	\$4,@MSG[2],$TMP

	sha256msg1	@MSG[2],@MSG[1]
	sha256rnds2	$CDGH,$ABEF

	movdqa	3*32-0x80($Tbl),$Wi

	sha256msg2	@MSG[3],@MSG[0]
	sha256rnds2	$ABEF,$CDGH	# 12-15
	pshufd	\$0x0e,$Wi,$Wi

	palignr	\$4,@MSG[3],$TMP

	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$CDGH,$ABEF

for($i=4;$i<16-3;$i++) {
	movdqa	$i*32-0x80($Tbl),$Wi

	sha256msg2	@MSG[0],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 16-19...
	pshufd	\$0x0e,$Wi,$Wi

	palignr	\$4,@MSG[0],$TMP

	sha256msg1	@MSG[0],@MSG[3]
	sha256rnds2	$CDGH,$ABEF

	push(@MSG,shift(@MSG));

	movdqa	13*32-0x80($Tbl),$Wi

	sha256msg2	@MSG[0],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 52-55
	pshufd	\$0x0e,$Wi,$Wi

	palignr	\$4,@MSG[0],$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa	14*32-0x80($Tbl),$Wi

	sha256rnds2	$ABEF,$CDGH	# 56-59
	pshufd	\$0x0e,$Wi,$Wi
	sha256msg2	@MSG[1],@MSG[2]

	sha256rnds2	$CDGH,$ABEF

	movdqa	15*32-0x80($Tbl),$Wi

	sha256rnds2	$ABEF,$CDGH	# 60-63
	pshufd	\$0x0e,$Wi,$Wi

	sha256rnds2	$CDGH,$ABEF

	paddd	$CDGH_SAVE,$CDGH
	paddd	$ABEF_SAVE,$ABEF

	pshufd	\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd	\$0x1b,$ABEF,$TMP	# FEBA
	pshufd	\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF	# DCBA
	palignr	\$8,$TMP,$CDGH		# HGFE

	movdqu	$CDGH,16($ctx)

$code.=<<___ if ($win64);
	movaps	-8-5*16(%rax),%xmm6
	movaps	-8-4*16(%rax),%xmm7
	movaps	-8-3*16(%rax),%xmm8
	movaps	-8-2*16(%rax),%xmm9
	movaps	-8-1*16(%rax),%xmm10

.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
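# E.g. an undefined &ror($a0,$Sigma1[2]-$Sigma1[1]) lands here and is
# emitted as "ror \$N,$a0": the popped last argument becomes the
# immediate and the remaining arguments follow in reverse (AT&T)
# order.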
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g
	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&and	($a4,$e)',			# (f^g)&e
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($d,$h)',			# d+=h
	'&add	($h,$a3)',			# h+=Maj(a,b,c)
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
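# body_00_15 yields one round as a list of code-generating strings
# instead of emitting code directly; the SIMD paths below eval()
# these strings a few at a time, interleaving the integer round
# instructions with each Xupdate so both execution domains stay busy.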
######################################################################

if ($SZ==4) {	# SHA256 only
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
.type	${func}_ssse3,\@function,3

	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax

	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*4`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8

$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5

	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	movdqu	0x00($inp),@X[0]
	movdqu	0x10($inp),@X[1]
	movdqu	0x20($inp),@X[2]

	movdqu	0x30($inp),@X[3]
	lea	$TABLE(%rip),$Tbl

	movdqa	0x00($Tbl),$t0
	movdqa	0x20($Tbl),$t1

	movdqa	0x40($Tbl),$t2

	movdqa	0x60($Tbl),$t3

	movdqa	$t0,0x00(%rsp)

	movdqa	$t1,0x10(%rsp)

	movdqa	$t2,0x20(%rsp)

	movdqa	$t3,0x30(%rsp)

	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
sub Xupdate_256_SSSE3 () {

	'&movdqa	($t0,@X[1]);',
	'&movdqa	($t3,@X[3])',
	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
	'&palignr	($t3,@X[2],$SZ);',	# X[9..12]

	'&movdqa	($t2,$t0);',
	'&psrld	($t0,$sigma0[2])',
	'&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
	'&psrld	($t2,$sigma0[0])',
	'&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&pslld	($t1,8*$SZ-$sigma0[1]);'.

	'&psrld	($t2,$sigma0[1]-$sigma0[0]);'.

	'&pslld	($t1,$sigma0[1]-$sigma0[0]);'.

	'&pxor	($t0,$t1);',		# sigma0(X[1..4])
	'&psrld	($t3,$sigma1[2])',
	'&paddd	(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
	'&psrlq	($t2,$sigma1[0])',

	'&psrlq	($t2,$sigma1[1]-$sigma1[0])',

	'&pshufb	($t3,$t4)',		# sigma1(X[14..15])
	'&paddd	(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
	'&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&movdqa	($t2,$t3);',
	'&psrld	($t3,$sigma1[2])',
	'&psrlq	($t2,$sigma1[0])',

	'&psrlq	($t2,$sigma1[1]-$sigma1[0])',

	'&movdqa	($t2,16*2*$j."($Tbl)")',

	'&paddd	(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
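
# SSE has no packed-rotate instruction, so each sigma rotation above
# is composed from a shift pair; as an illustration, rotr(x,n) on
# four 32-bit lanes is
#
#	movdqa	x,t / psrld \$n,x / pslld \$(32-n),t / pxor t,x
#
# which is where the psrld/pslld/pxor and psrlq ladders come from.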
sub SSSE3_256_00_47 () {

my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

    foreach (Xupdate_256_SSSE3()) {		# 36 instructions

    } else {			# squeeze extra 4% on Westmere and 19% on Atom
	  eval(shift(@insns));	#@

	  eval(shift(@insns));	#@

	  eval(shift(@insns));	#@

	 &palignr	($t0,@X[0],$SZ);	# X[1..4]

	 &palignr	($t3,@X[2],$SZ);	# X[9..12]

	  eval(shift(@insns));	#@

	  eval(shift(@insns));	#@

	 &psrld	($t0,$sigma0[2]);

	 &paddd	(@X[0],$t3);		# X[0..3] += X[9..12]
	  eval(shift(@insns));	#@

	 &psrld	($t2,$sigma0[0]);

	 &pshufd	($t3,@X[3],0b11111010);	# X[14..15]
	  eval(shift(@insns));	#@
	 &pslld	($t1,8*$SZ-$sigma0[1]);

	  eval(shift(@insns));	#@

	  eval(shift(@insns));	#@
	 &psrld	($t2,$sigma0[1]-$sigma0[0]);

	 &pslld	($t1,$sigma0[1]-$sigma0[0]);

	  eval(shift(@insns));	#@

	 &pxor	($t0,$t1);		# sigma0(X[1..4])
	  eval(shift(@insns));	#@

	 &psrld	($t3,$sigma1[2]);

	 &paddd	(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
	  eval(shift(@insns));	#@

	 &psrlq	($t2,$sigma1[0]);

	  eval(shift(@insns));	#@

	  eval(shift(@insns));	#@
	 &psrlq	($t2,$sigma1[1]-$sigma1[0]);

	  eval(shift(@insns));

	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	 #&pshufb	($t3,$t4);	# sigma1(X[14..15])
	 &pshufd	($t3,$t3,0b10000000);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));

	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	 &paddd	(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &pshufd	($t3,@X[0],0b01010000);	# X[16..17]
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));

	  eval(shift(@insns));
	  eval(shift(@insns));
	 &psrld	($t3,$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	 &psrlq	($t2,$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));

	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	 &psrlq	($t2,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));

	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));	#@

	 &pshufd	($t3,$t3,0b00001000);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &movdqa	($t2,16*2*$j."($Tbl)");
	  eval(shift(@insns));	#@
	  eval(shift(@insns));

	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &paddd	(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
	  eval(shift(@insns));	#@
	  eval(shift(@insns));
	  eval(shift(@insns));

    foreach (@insns) { eval; }		# remaining instructions
	&movdqa	(16*$j."(%rsp)",$t2);

for ($i=0,$j=0; $j<4; $j++) {
	&SSSE3_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)

	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lssse3_00_47");

for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }

	lea	16*$SZ($inp),$inp

$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9

.cfi_def_cfa_register	%rsp

.size	${func}_ssse3,.-${func}_ssse3
######################################################################

if ($SZ==8) {	# SHA512 only

.type	${func}_xop,\@function,3

	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax

	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8

$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)

$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)

	vmovdqa	$t1,0x10(%rsp)

	vmovdqa	$t2,0x20(%rsp)

	vmovdqa	$t3,0x30(%rsp)

	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
sub XOP_256_00_47 () {

my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotd	($t1,$t0,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpsrld	($t0,$t0,$sigma0[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotd	($t2,$t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor	($t0,$t0,$t1);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor	($t0,$t0,$t2);		# sigma0(X[1..4])
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpsrld	($t2,@X[3],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpaddd	(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor	($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor	($t3,$t3,$t1);		# sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpsrldq	($t3,$t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpaddd	(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpsrld	($t2,@X[0],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor	($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor	($t3,$t3,$t1);		# sigma1(X[16..17])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpslldq	($t3,$t3,8);		# 22 instructions
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpaddd	(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
    foreach (@insns) { eval; }		# remaining instructions
	 &vmovdqa	(16*$j."(%rsp)",$t2);

for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)

	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lxop_00_47");

for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    my @X = map("%xmm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vpshufb	$t3,@X[0],@X[0]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[1],@X[1]
	vmovdqu	0x40($inp),@X[4]
	vpshufb	$t3,@X[2],@X[2]
	vmovdqu	0x50($inp),@X[5]
	vpshufb	$t3,@X[3],@X[3]
	vmovdqu	0x60($inp),@X[6]
	vpshufb	$t3,@X[4],@X[4]
	vmovdqu	0x70($inp),@X[7]
	vpshufb	$t3,@X[5],@X[5]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t3,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t3,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x10(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x20(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x30(%rsp)
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x40(%rsp)

	vmovdqa	$t1,0x50(%rsp)

	vmovdqa	$t2,0x60(%rsp)

	vmovdqa	$t3,0x70(%rsp)

	add	\$`16*2*$SZ`,$Tbl
sub XOP_512_00_47 () {

my @insns = (&$body,&$body);			# 52 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..2]
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpalignr	($t3,@X[5],@X[4],$SZ);	# X[9..10]
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotq	($t1,$t0,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpsrlq	($t0,$t0,$sigma0[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpaddq	(@X[0],@X[0],$t3);	# X[0..1] += X[9..10]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotq	($t2,$t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor	($t0,$t0,$t1);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotq	($t3,@X[7],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor	($t0,$t0,$t2);		# sigma0(X[1..2])
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpsrlq	($t2,@X[7],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpaddq	(@X[0],@X[0],$t0);	# X[0..1] += sigma0(X[1..2])
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vprotq	($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor	($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpxor	($t3,$t3,$t1);		# sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpaddq	(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	 &vpaddq	($t2,@X[0],16*2*$j-0x80."($Tbl)");
    foreach (@insns) { eval; }		# remaining instructions
	 &vmovdqa	(16*$j."(%rsp)",$t2);

for ($i=0,$j=0; $j<8; $j++) {
	&XOP_512_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)

	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
	&jne	(".Lxop_00_47");

for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
	lea	16*$SZ($inp),$inp

$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9

$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11

.cfi_def_cfa_register	%rsp

.size	${func}_xop,.-${func}_xop
######################################################################
# AVX+shrd code path
#
local *ror = sub { &shrd(@_[0],@_) };
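
# shrd with identical source and destination registers is bit-for-bit
# a rotate right, and on Sandy Bridge it executes faster than ror,
# which is the improvement marked (**) in the table at the top.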
.type	${func}_avx,\@function,3

	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax

	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8

$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)

$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));

	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)

	vmovdqa	$t1,0x10(%rsp)

	vmovdqa	$t2,0x20(%rsp)

	vmovdqa	$t3,0x30(%rsp)

	sub	\$`-16*2*$SZ`,$Tbl	# size optimization
sub Xupdate_256_AVX () {

	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor	($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor	($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor	($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor	($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor	($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor	($t2,$t2,$t3)',
	'&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor	($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor	($t2,$t2,$t3)',
	'&vpshufb	($t2,$t2,$t5)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
sub AVX_256_00_47 () {

my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

    foreach (Xupdate_256_AVX()) {		# 29 instructions

	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
    foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);

for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)

	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");

for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    my @X = map("%xmm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vpshufb	$t3,@X[0],@X[0]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[1],@X[1]
	vmovdqu	0x40($inp),@X[4]
	vpshufb	$t3,@X[2],@X[2]
	vmovdqu	0x50($inp),@X[5]
	vpshufb	$t3,@X[3],@X[3]
	vmovdqu	0x60($inp),@X[6]
	vpshufb	$t3,@X[4],@X[4]
	vmovdqu	0x70($inp),@X[7]
	vpshufb	$t3,@X[5],@X[5]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t3,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t3,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x10(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x20(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x30(%rsp)
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x40(%rsp)

	vmovdqa	$t1,0x50(%rsp)

	vmovdqa	$t2,0x60(%rsp)

	vmovdqa	$t3,0x70(%rsp)

	add	\$`16*2*$SZ`,$Tbl
sub Xupdate_512_AVX () {

	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
	'&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
	'&vpsrlq	($t2,$t0,$sigma0[0])',
	'&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
	'&vpsrlq	($t3,$t0,$sigma0[2])',
	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor	($t0,$t3,$t2)',
	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor	($t0,$t0,$t1)',
	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor	($t0,$t0,$t2)',
	'&vpsrlq	($t3,@X[7],$sigma1[2]);',
	'&vpxor	($t0,$t0,$t1)',		# sigma0(X[1..2])
	'&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
	'&vpsrlq	($t1,@X[7],$sigma1[0]);',
	'&vpxor	($t3,$t3,$t2)',
	'&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
	'&vpxor	($t3,$t3,$t1)',
	'&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
	'&vpxor	($t3,$t3,$t2)',
	'&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])
sub AVX_512_00_47 () {

my @insns = (&$body,&$body);			# 52 instructions

    foreach (Xupdate_512_AVX()) {		# 23 instructions

	eval(shift(@insns));
	eval(shift(@insns));

	&vpaddq	($t2,@X[0],16*2*$j-0x80."($Tbl)");
    foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);

for ($i=0,$j=0; $j<8; $j++) {
	&AVX_512_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)

	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
	&jne	(".Lavx_00_47");

for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }

	lea	16*$SZ($inp),$inp

$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9

$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11

.cfi_def_cfa_register	%rsp

.size	${func}_avx,.-${func}_avx
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp

sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f

	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',		# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',

	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',		# ~e&g

	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',		# Sigma1(e)

	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
	'&xor	($a2,$b)',		# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',

	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',	# d+=h
	'&and	($a3,$a2)',		# (b^c)&(a^b)

	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',		# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov	($a4,$e)',		# copy of f in future

	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'

	# and at the finish one has to $a+=$a1
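# rorx (BMI2) rotates without touching flags and writes a separate
# destination, so the three Sigma rotations above don't fight over a
# single register, while andn (BMI1) yields ~e&g in one instruction;
# the two terms of Ch never overlap bitwise, which is why computing
# (e&f)+(~e&g) via lea equals the usual xor form.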
.type	${func}_avx2,\@function,3

	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax

	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
	shl	\$4,%rdx		# num*16
	and	\$-256*$SZ,%rsp		# align stack frame
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	add	\$`2*$SZ*($rounds-8)`,%rsp
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%rax,$_rsp		# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8

$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)

$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)

	sub	\$-16*$SZ,$inp		# inp++, size optimization

	mov	$inp,%r12		# borrow $T1

	cmp	%rdx,$inp		# $_end

	cmove	%rsp,%r12		# next block or random data
if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));

	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	vmovdqu	-16*$SZ+48($inp),%xmm3
	#mov	$inp,$_inp	# offload $inp
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]

	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)

	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp

	vmovdqa	$t2,0x00(%rsp)

	vmovdqa	$t3,0x20(%rsp)

	sub	\$-16*2*$SZ,$Tbl	# size optimization
sub AVX2_256_00_47 () {

my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
    foreach (Xupdate_256_AVX()) {		# 29 instructions

	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
    foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);

for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)

	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    my @X = map("%ymm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));

	vmovdqu	-16*$SZ($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	-16*$SZ+48($inp),%xmm3
	vmovdqu	-16*$SZ+64($inp),%xmm4
	vmovdqu	-16*$SZ+80($inp),%xmm5
	vmovdqu	-16*$SZ+96($inp),%xmm6
	vmovdqu	-16*$SZ+112($inp),%xmm7
	#mov	$inp,$_inp	# offload $inp
	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t2,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t2,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	vpshufb	$t2,@X[2],@X[2]
	vinserti128	\$1,64(%r12),@X[4],@X[4]
	vpshufb	$t2,@X[3],@X[3]
	vinserti128	\$1,80(%r12),@X[5],@X[5]
	vpshufb	$t2,@X[4],@X[4]
	vinserti128	\$1,96(%r12),@X[6],@X[6]
	vpshufb	$t2,@X[5],@X[5]
	vinserti128	\$1,112(%r12),@X[7],@X[7]

	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t2,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t2,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x20(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x40(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x60(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x00(%rsp)

	vmovdqa	$t1,0x20(%rsp)

	vmovdqa	$t2,0x40(%rsp)

	vmovdqa	$t3,0x60(%rsp)
sub AVX2_512_00_47 () {

my @insns = (&$body,&$body);			# 48 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
    foreach (Xupdate_512_AVX()) {		# 23 instructions

	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));

	&vpaddq	($t2,@X[0],16*2*$j-0x80."($Tbl)");
    foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);

for ($i=0,$j=0; $j<8; $j++) {
	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)

	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }

	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx

	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	cmp	`$PUSH8+2*8`($Tbl),$inp		# $_end

for ($i=0; $i<8; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }

	lea	-$PUSH8($Tbl),$Tbl

	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx

	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp

	lea	`2*16*$SZ`($inp),$inp	# inp+=2

	cmove	%rsp,%r12		# next block or stale data
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9

$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11

.cfi_def_cfa_register	%rsp

.size	${func}_avx2,.-${func}_avx2
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)

.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut

	add	\$`2*$SZ*($rounds-8)`,%rax

	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	.Lepilogue(%rip),%r10

	jb	.Lin_prologue		# non-AVX code

	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$`$SZ==4?8:12`,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch

.size	se_handler,.-se_handler
$code.=<<___ if ($SZ==4 && $shaext);
.type	shaext_handler,\@abi-omnipotent

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue

	lea	-8-5*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6

	.long	0xa548f3fc		# cld; rep movsq

.size	shaext_handler,.-shaext_handler
	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func

$code.=<<___ if ($SZ==4 && $shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext

$code.=<<___ if ($SZ==4);
	.rva	.LSEH_begin_${func}_ssse3
	.rva	.LSEH_end_${func}_ssse3
	.rva	.LSEH_info_${func}_ssse3

$code.=<<___ if ($avx && $SZ==8);
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop

$code.=<<___ if ($avx);
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx

$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2

	.rva	.Lprologue,.Lepilogue			# HandlerData[]

$code.=<<___ if ($SZ==4 && $shaext);
.LSEH_info_${func}_shaext:

$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]

$code.=<<___ if ($avx && $SZ==8);
.LSEH_info_${func}_xop:
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]

$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]

$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
2476 "sha256rnds2" => 0xcb,
2477 "sha256msg1" => 0xcc,
2478 "sha256msg2" => 0xcd );
2480 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
2481 my @opcode=(0x0f,0x38);
2482 push @opcode,$opcodelet{$instr};
2483 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
2484 return ".byte\t".join(',',@opcode);
2486 return $instr."\t".@_[0];
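# E.g. "sha256rnds2 %xmm4,%xmm1" becomes .byte 0x0f,0x38,0xcb,0xcc:
# 0xcb selects sha256rnds2 and the ModR/M byte 0xc0|4|(1<<3)=0xcc
# encodes register-direct %xmm4 as source and %xmm1 as destination.
# This keeps the module usable with assemblers that predate the SHA
# extensions.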
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;