2 # Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. Rights for redistribution and usage in source and binary
13 # forms are granted according to the License.
14 # ====================================================================
16 # sha256/512_block procedure for x86_64.
18 # 40% improvement over compiler-generated code on Opteron. On EM64T
19 # sha256 was observed to run >80% faster and sha512 - >40%. No magical
20 # tricks, just straight implementation... I really wonder why gcc
21 # [being armed with inline assembler] fails to generate as fast code.
# The only thing which is cool about this module is that it's the very
# same instruction sequence that is used for both SHA-256 and SHA-512. In
# the former case the instructions operate on 32-bit operands, while in
# the latter - on 64-bit ones. All I had to do was to get one flavor right;
# the other one passed the test right away:-)
28 # sha256_block runs in ~1005 cycles on Opteron, which gives you
29 # asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
30 # frequency in GHz. sha512_block runs in ~1275 cycles, which results
31 # in 128*1000/1275=100MBps per GHz. Is there room for improvement?
32 # Well, if you compare it to IA-64 implementation, which maintains
33 # X[16] in register bank[!], tends to 4 instructions per CPU clock
34 # cycle and runs in 1003 cycles, 1275 is very good result for 3-way
35 # issue Opteron pipeline and X[16] maintained in memory. So that *if*
36 # there is a way to improve it, *then* the only way would be to try to
37 # offload X[16] updates to SSE unit, but that would require "deeper"
38 # loop unroll, which in turn would naturally cause size blow-up, not
39 # to mention increased complexity! And once again, only *if* it's
40 # actually possible to noticeably improve overall ILP, instruction
41 # level parallelism, on a given CPU implementation in this case.
43 # Special note on Intel EM64T. While Opteron CPU exhibits perfect
44 # performance ratio of 1.5 between 64- and 32-bit flavors [see above],
45 # [currently available] EM64T CPUs apparently are far from it. On the
46 # contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
# sha256_block:-( This is presumably because 64-bit shifts/rotates
# are not atomic instructions, but are implemented in microcode.
52 # Optimization including one of Pavel Semjanov's ideas, alternative
53 # Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
54 # unfortunately -2% SHA512 on P4 [which nobody should care about
59 # Add SIMD code paths, see below for improvement coefficients. SSSE3
60 # code path was not attempted for SHA512, because improvement is not
61 # estimated to be high enough, noticeably less than 9%, to justify
# the effort, at least not on pre-AVX processors. [Obviously with exclusion
63 # for VIA Nano, but it has SHA512 instruction that is faster and
64 # should be used instead.] For reference, corresponding estimated
65 # upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
66 # higher coefficients are observed on VIA Nano and Bulldozer has more
67 # to do with specifics of their architecture [which is topic for
68 # separate discussion].
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most significant.
75 # The data is then processed with same SIMD instruction sequence as
76 # for AVX, but with %ymm as operands. Side effect is increased stack
77 # frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
82 # Add support for Intel SHA Extensions.
84 ######################################################################
85 # Current performance in cycles per processed byte (less is better):
87 # SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*)
89 # AMD K8 14.9 - - 9.57 -
91 # Core 2 15.6 13.8(+13%) - 9.97 -
92 # Westmere 14.8 12.3(+19%) - 9.58 -
93 # Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**))
94 # Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%)
95 # Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%)
96 # Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%)
97 # Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%)
98 # Ryzen 11.0 9.02(+22%) 2.05(+440%) 7.05 5.67(+20%)
99 # VIA Nano 23.0 16.5(+39%) - 14.7 -
100 # Atom 23.0 18.9(+22%) - 14.7 -
101 # Silvermont 27.4 20.6(+33%) - 17.5 -
102 # Knights L 27.4 21.0(+30%) 19.6(+40%) 17.5 12.8(+37%)
103 # Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 -
105 # (*) whichever best applicable, including SHAEXT;
106 # (**) switch from ror to shrd stands for fair share of improvement;
107 # (***) execution time is fully determined by remaining integer-only
108 # part, body_00_15; reducing the amount of SIMD instructions
109 # below certain limit makes no difference/sense; to conserve
110 # space SHA256 XOP code path is therefore omitted;
112 # $output is the last argument if it looks like a file (it has an extension)
113 # $flavour is the first argument if it doesn't look like a file
114 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
115 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
117 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
119 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
120 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
121 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
122 die "can't locate x86_64-xlate.pl";
124 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
125 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
126 $avx = ($1>=2.19) + ($1>=2.22);
129 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
130 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
131 $avx = ($1>=2.09) + ($1>=2.10);
134 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
135 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
136 $avx = ($1>=10) + ($1>=11);
139 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
140 $avx = ($2>=3.0) + ($2>3.0);
143 $shaext=1; ### set to zero if compiling for 1.0.1
144 $avx=1 if (!$shaext && $avx);
146 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
147 or die "can't call $xlate: $!";
150 if ($output =~ /512/) {
151 $func="sha512_block_data_order";
154 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
155 "%r8", "%r9", "%r10","%r11");
156 ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
163 $func="sha256_block_data_order";
166 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
167 "%r8d","%r9d","%r10d","%r11d");
168 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
176 $ctx="%rdi"; # 1st arg, zapped by $a3
177 $inp="%rsi"; # 2nd arg
180 $_ctx="16*$SZ+0*8(%rsp)";
181 $_inp="16*$SZ+1*8(%rsp)";
182 $_end="16*$SZ+2*8(%rsp)";
183 $_rsp="`16*$SZ+3*8`(%rsp)";
184 $framesz="16*$SZ+4*8";
188 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
190 $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
193 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
197 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
200 mov $T1,`$SZ*($i&0xf)`(%rsp)
204 ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
206 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
208 ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
210 add $a2,$T1 # T1+=Ch(e,f,g)
213 add ($Tbl),$T1 # T1+=K[round]
216 xor $b,$a2 # a^b, b^c in next round
217 ror \$$Sigma1[0],$a0 # Sigma1(e)
221 ror \$$Sigma0[0],$a1 # Sigma0(a)
222 add $a0,$T1 # T1+=Sigma1(e)
224 xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
228 lea $STRIDE($Tbl),$Tbl # round++
230 $code.=<<___ if ($i<15);
231 add $a1,$h # h+=Sigma0(a)
233 ($a2,$a3) = ($a3,$a2);
237 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
240 mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
241 mov `$SZ*(($i+14)&0xf)`(%rsp),$a2
244 ror \$`$sigma0[1]-$sigma0[0]`,$a0
245 add $a1,$a # modulo-scheduled h+=Sigma0(a)
247 ror \$`$sigma1[1]-$sigma1[0]`,$a2
256 xor $a0,$T1 # sigma0(X[(i+1)&0xf])
257 xor $a1,$a2 # sigma1(X[(i+14)&0xf])
258 add `$SZ*(($i+9)&0xf)`(%rsp),$T1
260 add `$SZ*($i&0xf)`(%rsp),$T1
271 .extern OPENSSL_ia32cap_P
273 .type $func,\@function,3
278 $code.=<<___ if ($SZ==4 || $avx);
279 lea OPENSSL_ia32cap_P(%rip),%r11
284 $code.=<<___ if ($SZ==4 && $shaext);
285 test \$`1<<29`,%r11d # check for SHA
288 $code.=<<___ if ($avx && $SZ==8);
289 test \$`1<<11`,%r10d # check for XOP
292 $code.=<<___ if ($avx>1);
293 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
294 cmp \$`1<<8|1<<5|1<<3`,%r11d
297 $code.=<<___ if ($avx);
298 and \$`1<<30`,%r9d # mask "Intel CPU" bit
299 and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
301 cmp \$`1<<28|1<<9|1<<30`,%r10d
304 $code.=<<___ if ($SZ==4);
309 mov %rsp,%rax # copy %rsp
310 .cfi_def_cfa_register %rax
323 shl \$4,%rdx # num*16
325 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
326 and \$-64,%rsp # align stack frame
327 mov $ctx,$_ctx # save ctx, 1st arg
328 mov $inp,$_inp # save inp, 2nd arh
329 mov %rdx,$_end # save end pointer, "3rd" arg
330 mov %rax,$_rsp # save copy of %rsp
331 .cfi_cfa_expression $_rsp,deref,+8
347 lea $TABLE(%rip),$Tbl
350 for($i=0;$i<16;$i++) {
351 $code.=" mov $SZ*$i($inp),$T1\n";
352 $code.=" mov @ROT[4],$a0\n";
353 $code.=" mov @ROT[0],$a1\n";
354 $code.=" bswap $T1\n";
355 &ROUND_00_15($i,@ROT);
356 unshift(@ROT,pop(@ROT));
364 &ROUND_16_XX($i,@ROT);
365 unshift(@ROT,pop(@ROT));
369 cmpb \$0,`$SZ-1`($Tbl)
373 add $a1,$A # modulo-scheduled h+=Sigma0(a)
374 lea 16*$SZ($inp),$inp
412 .cfi_def_cfa_register %rsp
422 .type $TABLE,\@object
424 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
425 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
426 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
427 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
428 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
429 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
430 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
431 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
432 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
433 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
434 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
435 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
436 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
437 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
438 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
439 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
440 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
441 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
442 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
443 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
444 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
445 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
446 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
447 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
448 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
449 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
450 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
451 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
452 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
453 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
454 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
455 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
457 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
458 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
459 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
460 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
461 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
462 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
463 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
468 .type $TABLE,\@object
470 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
471 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
472 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
473 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
474 .quad 0x3956c25bf348b538,0x59f111f1b605d019
475 .quad 0x3956c25bf348b538,0x59f111f1b605d019
476 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
477 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
478 .quad 0xd807aa98a3030242,0x12835b0145706fbe
479 .quad 0xd807aa98a3030242,0x12835b0145706fbe
480 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
481 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
482 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
483 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
484 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
485 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
486 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
487 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
488 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
489 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
490 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
491 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
492 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
493 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
494 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
495 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
496 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
497 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
498 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
499 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
500 .quad 0x06ca6351e003826f,0x142929670a0e6e70
501 .quad 0x06ca6351e003826f,0x142929670a0e6e70
502 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
503 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
504 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
505 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
506 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
507 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
508 .quad 0x81c2c92e47edaee6,0x92722c851482353b
509 .quad 0x81c2c92e47edaee6,0x92722c851482353b
510 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
511 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
512 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
513 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
514 .quad 0xd192e819d6ef5218,0xd69906245565a910
515 .quad 0xd192e819d6ef5218,0xd69906245565a910
516 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
517 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
518 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
519 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
520 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
521 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
522 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
523 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
524 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
525 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
526 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
527 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
528 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
529 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
530 .quad 0x90befffa23631e28,0xa4506cebde82bde9
531 .quad 0x90befffa23631e28,0xa4506cebde82bde9
532 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
533 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
534 .quad 0xca273eceea26619c,0xd186b8c721c0c207
535 .quad 0xca273eceea26619c,0xd186b8c721c0c207
536 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
537 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
538 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
539 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
540 .quad 0x113f9804bef90dae,0x1b710b35131c471b
541 .quad 0x113f9804bef90dae,0x1b710b35131c471b
542 .quad 0x28db77f523047d84,0x32caab7b40c72493
543 .quad 0x28db77f523047d84,0x32caab7b40c72493
544 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
545 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
546 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
547 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
548 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
549 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
551 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
552 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
553 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
557 ######################################################################
560 if ($SZ==4 && $shaext) {{{
561 ######################################################################
562 # Intel SHA Extensions implementation of SHA256 update function.
564 my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
566 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
567 my @MSG=map("%xmm$_",(3..6));
570 .type sha256_block_data_order_shaext,\@function,3
572 sha256_block_data_order_shaext:
576 $code.=<<___ if ($win64);
577 lea `-8-5*16`(%rsp),%rsp
578 movaps %xmm6,-8-5*16(%rax)
579 movaps %xmm7,-8-4*16(%rax)
580 movaps %xmm8,-8-3*16(%rax)
581 movaps %xmm9,-8-2*16(%rax)
582 movaps %xmm10,-8-1*16(%rax)
586 lea K256+0x80(%rip),$Tbl
587 movdqu ($ctx),$ABEF # DCBA
588 movdqu 16($ctx),$CDGH # HGFE
589 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
591 pshufd \$0x1b,$ABEF,$Wi # ABCD
592 pshufd \$0xb1,$ABEF,$ABEF # CDAB
593 pshufd \$0x1b,$CDGH,$CDGH # EFGH
594 movdqa $TMP,$BSWAP # offload
595 palignr \$8,$CDGH,$ABEF # ABEF
596 punpcklqdq $Wi,$CDGH # CDGH
601 movdqu ($inp),@MSG[0]
602 movdqu 0x10($inp),@MSG[1]
603 movdqu 0x20($inp),@MSG[2]
605 movdqu 0x30($inp),@MSG[3]
607 movdqa 0*32-0x80($Tbl),$Wi
610 movdqa $CDGH,$CDGH_SAVE # offload
611 sha256rnds2 $ABEF,$CDGH # 0-3
612 pshufd \$0x0e,$Wi,$Wi
614 movdqa $ABEF,$ABEF_SAVE # offload
615 sha256rnds2 $CDGH,$ABEF
617 movdqa 1*32-0x80($Tbl),$Wi
620 sha256rnds2 $ABEF,$CDGH # 4-7
621 pshufd \$0x0e,$Wi,$Wi
623 sha256msg1 @MSG[1],@MSG[0]
624 sha256rnds2 $CDGH,$ABEF
626 movdqa 2*32-0x80($Tbl),$Wi
629 sha256rnds2 $ABEF,$CDGH # 8-11
630 pshufd \$0x0e,$Wi,$Wi
632 palignr \$4,@MSG[2],$TMP
635 sha256msg1 @MSG[2],@MSG[1]
636 sha256rnds2 $CDGH,$ABEF
638 movdqa 3*32-0x80($Tbl),$Wi
640 sha256msg2 @MSG[3],@MSG[0]
641 sha256rnds2 $ABEF,$CDGH # 12-15
642 pshufd \$0x0e,$Wi,$Wi
644 palignr \$4,@MSG[3],$TMP
647 sha256msg1 @MSG[3],@MSG[2]
648 sha256rnds2 $CDGH,$ABEF
650 for($i=4;$i<16-3;$i++) {
652 movdqa $i*32-0x80($Tbl),$Wi
654 sha256msg2 @MSG[0],@MSG[1]
655 sha256rnds2 $ABEF,$CDGH # 16-19...
656 pshufd \$0x0e,$Wi,$Wi
658 palignr \$4,@MSG[0],$TMP
661 sha256msg1 @MSG[0],@MSG[3]
662 sha256rnds2 $CDGH,$ABEF
664 push(@MSG,shift(@MSG));
667 movdqa 13*32-0x80($Tbl),$Wi
669 sha256msg2 @MSG[0],@MSG[1]
670 sha256rnds2 $ABEF,$CDGH # 52-55
671 pshufd \$0x0e,$Wi,$Wi
673 palignr \$4,@MSG[0],$TMP
674 sha256rnds2 $CDGH,$ABEF
677 movdqa 14*32-0x80($Tbl),$Wi
679 sha256rnds2 $ABEF,$CDGH # 56-59
680 pshufd \$0x0e,$Wi,$Wi
681 sha256msg2 @MSG[1],@MSG[2]
683 sha256rnds2 $CDGH,$ABEF
685 movdqa 15*32-0x80($Tbl),$Wi
688 sha256rnds2 $ABEF,$CDGH # 60-63
689 pshufd \$0x0e,$Wi,$Wi
692 sha256rnds2 $CDGH,$ABEF
694 paddd $CDGH_SAVE,$CDGH
695 paddd $ABEF_SAVE,$ABEF
698 pshufd \$0xb1,$CDGH,$CDGH # DCHG
699 pshufd \$0x1b,$ABEF,$TMP # FEBA
700 pshufd \$0xb1,$ABEF,$ABEF # BAFE
701 punpckhqdq $CDGH,$ABEF # DCBA
702 palignr \$8,$TMP,$CDGH # HGFE
705 movdqu $CDGH,16($ctx)
707 $code.=<<___ if ($win64);
708 movaps -8-5*16(%rax),%xmm6
709 movaps -8-4*16(%rax),%xmm7
710 movaps -8-3*16(%rax),%xmm8
711 movaps -8-2*16(%rax),%xmm9
712 movaps -8-1*16(%rax),%xmm10
719 .size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
725 my ($a,$b,$c,$d,$e,$f,$g,$h);
727 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
728 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
730 $arg = "\$$arg" if ($arg*1 eq $arg);
731 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
736 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
738 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
742 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
744 '&xor ($a4,$g)', # f^g
746 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
748 '&and ($a4,$e)', # (f^g)&e
751 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
754 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
755 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
756 '&xor ($a2,$b)', # a^b, b^c in next round
758 '&add ($h,$a4)', # h+=Ch(e,f,g)
759 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
760 '&and ($a3,$a2)', # (b^c)&(a^b)
763 '&add ($h,$a0)', # h+=Sigma1(e)
764 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
766 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
767 '&add ($d,$h)', # d+=h
768 '&add ($h,$a3)', # h+=Maj(a,b,c)
771 '&add ($a1,$h);'. # h+=Sigma0(a)
772 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
776 ######################################################################
779 if ($SZ==4) { # SHA256 only
780 my @X = map("%xmm$_",(0..3));
781 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
784 .type ${func}_ssse3,\@function,3
789 mov %rsp,%rax # copy %rsp
790 .cfi_def_cfa_register %rax
803 shl \$4,%rdx # num*16
804 sub \$`$framesz+$win64*16*4`,%rsp
805 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
806 and \$-64,%rsp # align stack frame
807 mov $ctx,$_ctx # save ctx, 1st arg
808 mov $inp,$_inp # save inp, 2nd arh
809 mov %rdx,$_end # save end pointer, "3rd" arg
810 mov %rax,$_rsp # save copy of %rsp
811 .cfi_cfa_expression $_rsp,deref,+8
813 $code.=<<___ if ($win64);
814 movaps %xmm6,16*$SZ+32(%rsp)
815 movaps %xmm7,16*$SZ+48(%rsp)
816 movaps %xmm8,16*$SZ+64(%rsp)
817 movaps %xmm9,16*$SZ+80(%rsp)
833 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
834 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
838 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
839 movdqu 0x00($inp),@X[0]
840 movdqu 0x10($inp),@X[1]
841 movdqu 0x20($inp),@X[2]
843 movdqu 0x30($inp),@X[3]
844 lea $TABLE(%rip),$Tbl
846 movdqa 0x00($Tbl),$t0
847 movdqa 0x20($Tbl),$t1
850 movdqa 0x40($Tbl),$t2
852 movdqa 0x60($Tbl),$t3
856 movdqa $t0,0x00(%rsp)
858 movdqa $t1,0x10(%rsp)
860 movdqa $t2,0x20(%rsp)
862 movdqa $t3,0x30(%rsp)
868 sub \$`-16*2*$SZ`,$Tbl # size optimization
870 sub Xupdate_256_SSSE3 () {
872 '&movdqa ($t0,@X[1]);',
873 '&movdqa ($t3,@X[3])',
874 '&palignr ($t0,@X[0],$SZ)', # X[1..4]
875 '&palignr ($t3,@X[2],$SZ);', # X[9..12]
877 '&movdqa ($t2,$t0);',
878 '&psrld ($t0,$sigma0[2])',
879 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
880 '&psrld ($t2,$sigma0[0])',
881 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
882 '&pslld ($t1,8*$SZ-$sigma0[1]);'.
884 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
886 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
889 '&pxor ($t0,$t1);', # sigma0(X[1..4])
890 '&psrld ($t3,$sigma1[2])',
891 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
892 '&psrlq ($t2,$sigma1[0])',
894 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
896 '&pshufb ($t3,$t4)', # sigma1(X[14..15])
897 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
898 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
899 '&movdqa ($t2,$t3);',
900 '&psrld ($t3,$sigma1[2])',
901 '&psrlq ($t2,$sigma1[0])',
903 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
905 '&movdqa ($t2,16*2*$j."($Tbl)")',
907 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
911 sub SSSE3_256_00_47 () {
915 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
918 foreach (Xupdate_256_SSSE3()) { # 36 instructions
924 } else { # squeeze extra 4% on Westmere and 19% on Atom
925 eval(shift(@insns)); #@
930 eval(shift(@insns)); #@
933 eval(shift(@insns)); #@
935 &palignr ($t0,@X[0],$SZ); # X[1..4]
938 &palignr ($t3,@X[2],$SZ); # X[9..12]
942 eval(shift(@insns)); #@
947 eval(shift(@insns)); #@
949 &psrld ($t0,$sigma0[2]);
953 &paddd (@X[0],$t3); # X[0..3] += X[9..12]
954 eval(shift(@insns)); #@
956 &psrld ($t2,$sigma0[0]);
959 &pshufd ($t3,@X[3],0b11111010); # X[4..15]
961 eval(shift(@insns)); #@
962 &pslld ($t1,8*$SZ-$sigma0[1]);
966 eval(shift(@insns)); #@
969 eval(shift(@insns)); #@
970 &psrld ($t2,$sigma0[1]-$sigma0[0]);
975 &pslld ($t1,$sigma0[1]-$sigma0[0]);
980 eval(shift(@insns)); #@
984 &pxor ($t0,$t1); # sigma0(X[1..4])
985 eval(shift(@insns)); #@
988 &psrld ($t3,$sigma1[2]);
991 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
992 eval(shift(@insns)); #@
994 &psrlq ($t2,$sigma1[0]);
999 eval(shift(@insns)); #@
1000 eval(shift(@insns));
1001 eval(shift(@insns));
1002 eval(shift(@insns)); #@
1003 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
1004 eval(shift(@insns));
1005 eval(shift(@insns));
1007 eval(shift(@insns)); #@
1008 eval(shift(@insns));
1009 eval(shift(@insns));
1010 #&pshufb ($t3,$t4); # sigma1(X[14..15])
1011 &pshufd ($t3,$t3,0b10000000);
1012 eval(shift(@insns));
1013 eval(shift(@insns));
1014 eval(shift(@insns));
1016 eval(shift(@insns));
1017 eval(shift(@insns)); #@
1018 eval(shift(@insns));
1019 eval(shift(@insns));
1020 eval(shift(@insns)); #@
1021 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1022 eval(shift(@insns));
1023 eval(shift(@insns));
1024 eval(shift(@insns));
1025 &pshufd ($t3,@X[0],0b01010000); # X[16..17]
1026 eval(shift(@insns));
1027 eval(shift(@insns)); #@
1028 eval(shift(@insns));
1030 eval(shift(@insns));
1031 eval(shift(@insns));
1032 &psrld ($t3,$sigma1[2]);
1033 eval(shift(@insns));
1034 eval(shift(@insns)); #@
1035 &psrlq ($t2,$sigma1[0]);
1036 eval(shift(@insns));
1037 eval(shift(@insns));
1039 eval(shift(@insns)); #@
1040 eval(shift(@insns));
1041 eval(shift(@insns));
1042 eval(shift(@insns)); #@
1043 eval(shift(@insns));
1044 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
1045 eval(shift(@insns));
1046 eval(shift(@insns));
1047 eval(shift(@insns));
1049 eval(shift(@insns));
1050 eval(shift(@insns));
1051 eval(shift(@insns)); #@
1053 &pshufd ($t3,$t3,0b00001000);
1054 eval(shift(@insns));
1055 eval(shift(@insns));
1056 &movdqa ($t2,16*2*$j."($Tbl)");
1057 eval(shift(@insns)); #@
1058 eval(shift(@insns));
1060 eval(shift(@insns));
1061 eval(shift(@insns));
1062 eval(shift(@insns));
1063 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1064 eval(shift(@insns)); #@
1065 eval(shift(@insns));
1066 eval(shift(@insns));
1069 foreach (@insns) { eval; } # remaining instructions
1070 &movdqa (16*$j."(%rsp)",$t2);
1073 for ($i=0,$j=0; $j<4; $j++) {
1074 &SSSE3_256_00_47($j,\&body_00_15,@X);
1075 push(@X,shift(@X)); # rotate(@X)
1077 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1078 &jne (".Lssse3_00_47");
1080 for ($i=0; $i<16; ) {
1081 foreach(body_00_15()) { eval; }
1088 lea 16*$SZ($inp),$inp
1112 $code.=<<___ if ($win64);
1113 movaps 16*$SZ+32(%rsp),%xmm6
1114 movaps 16*$SZ+48(%rsp),%xmm7
1115 movaps 16*$SZ+64(%rsp),%xmm8
1116 movaps 16*$SZ+80(%rsp),%xmm9
1132 .cfi_def_cfa_register %rsp
1136 .size ${func}_ssse3,.-${func}_ssse3
1141 ######################################################################
1144 if ($SZ==8) { # SHA512 only
1146 .type ${func}_xop,\@function,3
1151 mov %rsp,%rax # copy %rsp
1152 .cfi_def_cfa_register %rax
1165 shl \$4,%rdx # num*16
1166 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1167 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1168 and \$-64,%rsp # align stack frame
1169 mov $ctx,$_ctx # save ctx, 1st arg
1170 mov $inp,$_inp # save inp, 2nd arh
1171 mov %rdx,$_end # save end pointer, "3rd" arg
1172 mov %rax,$_rsp # save copy of %rsp
1173 .cfi_cfa_expression $_rsp,deref,+8
1175 $code.=<<___ if ($win64);
1176 movaps %xmm6,16*$SZ+32(%rsp)
1177 movaps %xmm7,16*$SZ+48(%rsp)
1178 movaps %xmm8,16*$SZ+64(%rsp)
1179 movaps %xmm9,16*$SZ+80(%rsp)
1181 $code.=<<___ if ($win64 && $SZ>4);
1182 movaps %xmm10,16*$SZ+96(%rsp)
1183 movaps %xmm11,16*$SZ+112(%rsp)
1199 if ($SZ==4) { # SHA256
1200 my @X = map("%xmm$_",(0..3));
1201 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
1206 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1207 vmovdqu 0x00($inp),@X[0]
1208 vmovdqu 0x10($inp),@X[1]
1209 vmovdqu 0x20($inp),@X[2]
1210 vmovdqu 0x30($inp),@X[3]
1211 vpshufb $t3,@X[0],@X[0]
1212 lea $TABLE(%rip),$Tbl
1213 vpshufb $t3,@X[1],@X[1]
1214 vpshufb $t3,@X[2],@X[2]
1215 vpaddd 0x00($Tbl),@X[0],$t0
1216 vpshufb $t3,@X[3],@X[3]
1217 vpaddd 0x20($Tbl),@X[1],$t1
1218 vpaddd 0x40($Tbl),@X[2],$t2
1219 vpaddd 0x60($Tbl),@X[3],$t3
1220 vmovdqa $t0,0x00(%rsp)
1222 vmovdqa $t1,0x10(%rsp)
1224 vmovdqa $t2,0x20(%rsp)
1226 vmovdqa $t3,0x30(%rsp)
1232 sub \$`-16*2*$SZ`,$Tbl # size optimization
1234 sub XOP_256_00_47 () {
1238 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1240 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
1241 eval(shift(@insns));
1242 eval(shift(@insns));
1243 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
1244 eval(shift(@insns));
1245 eval(shift(@insns));
1246 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
1247 eval(shift(@insns));
1248 eval(shift(@insns));
1249 &vpsrld ($t0,$t0,$sigma0[2]);
1250 eval(shift(@insns));
1251 eval(shift(@insns));
1252 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
1253 eval(shift(@insns));
1254 eval(shift(@insns));
1255 eval(shift(@insns));
1256 eval(shift(@insns));
1257 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
1258 eval(shift(@insns));
1259 eval(shift(@insns));
1260 &vpxor ($t0,$t0,$t1);
1261 eval(shift(@insns));
1262 eval(shift(@insns));
1263 eval(shift(@insns));
1264 eval(shift(@insns));
1265 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
1266 eval(shift(@insns));
1267 eval(shift(@insns));
1268 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
1269 eval(shift(@insns));
1270 eval(shift(@insns));
1271 &vpsrld ($t2,@X[3],$sigma1[2]);
1272 eval(shift(@insns));
1273 eval(shift(@insns));
1274 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
1275 eval(shift(@insns));
1276 eval(shift(@insns));
1277 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1278 eval(shift(@insns));
1279 eval(shift(@insns));
1280 &vpxor ($t3,$t3,$t2);
1281 eval(shift(@insns));
1282 eval(shift(@insns));
1283 eval(shift(@insns));
1284 eval(shift(@insns));
1285 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1286 eval(shift(@insns));
1287 eval(shift(@insns));
1288 eval(shift(@insns));
1289 eval(shift(@insns));
1290 &vpsrldq ($t3,$t3,8);
1291 eval(shift(@insns));
1292 eval(shift(@insns));
1293 eval(shift(@insns));
1294 eval(shift(@insns));
1295 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1296 eval(shift(@insns));
1297 eval(shift(@insns));
1298 eval(shift(@insns));
1299 eval(shift(@insns));
1300 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
1301 eval(shift(@insns));
1302 eval(shift(@insns));
1303 &vpsrld ($t2,@X[0],$sigma1[2]);
1304 eval(shift(@insns));
1305 eval(shift(@insns));
1306 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1307 eval(shift(@insns));
1308 eval(shift(@insns));
1309 &vpxor ($t3,$t3,$t2);
1310 eval(shift(@insns));
1311 eval(shift(@insns));
1312 eval(shift(@insns));
1313 eval(shift(@insns));
1314 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
1315 eval(shift(@insns));
1316 eval(shift(@insns));
1317 eval(shift(@insns));
1318 eval(shift(@insns));
1319 &vpslldq ($t3,$t3,8); # 22 instructions
1320 eval(shift(@insns));
1321 eval(shift(@insns));
1322 eval(shift(@insns));
1323 eval(shift(@insns));
1324 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1325 eval(shift(@insns));
1326 eval(shift(@insns));
1327 eval(shift(@insns));
1328 eval(shift(@insns));
1329 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1330 foreach (@insns) { eval; } # remaining instructions
1331 &vmovdqa (16*$j."(%rsp)",$t2);
# Drive four iterations of the XOP SHA-256 schedule generator; @X is
# rotated so @X[0] always names the quadruple being updated.
# NOTE(review): fragmentary paste — embedded line numbers and missing
# lines are preserved byte-identically; comments only are added.
1334 for ($i=0,$j=0; $j<4; $j++) {
1335 &XOP_256_00_47($j,\&body_00_15,@X);
1336 push(@X,shift(@X)); # rotate(@X)
1338 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1339 &jne (".Lxop_00_47");
# Final 16 rounds: plain scalar round bodies, no further message expansion.
1341 for ($i=0; $i<16; ) {
1342 foreach(body_00_15()) { eval; }
# SHA-512 XOP path: eight xmm registers hold the 16-word schedule
# (two 64-bit words per register); $t0..$t3 are scratch.
1346 my @X = map("%xmm$_",(0..7));
1347 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1352 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1353 vmovdqu 0x00($inp),@X[0]
1354 lea $TABLE+0x80(%rip),$Tbl # size optimization
1355 vmovdqu 0x10($inp),@X[1]
1356 vmovdqu 0x20($inp),@X[2]
1357 vpshufb $t3,@X[0],@X[0]
1358 vmovdqu 0x30($inp),@X[3]
1359 vpshufb $t3,@X[1],@X[1]
1360 vmovdqu 0x40($inp),@X[4]
1361 vpshufb $t3,@X[2],@X[2]
1362 vmovdqu 0x50($inp),@X[5]
1363 vpshufb $t3,@X[3],@X[3]
1364 vmovdqu 0x60($inp),@X[6]
1365 vpshufb $t3,@X[4],@X[4]
1366 vmovdqu 0x70($inp),@X[7]
1367 vpshufb $t3,@X[5],@X[5]
1368 vpaddq -0x80($Tbl),@X[0],$t0
1369 vpshufb $t3,@X[6],@X[6]
1370 vpaddq -0x60($Tbl),@X[1],$t1
1371 vpshufb $t3,@X[7],@X[7]
1372 vpaddq -0x40($Tbl),@X[2],$t2
1373 vpaddq -0x20($Tbl),@X[3],$t3
1374 vmovdqa $t0,0x00(%rsp)
1375 vpaddq 0x00($Tbl),@X[4],$t0
1376 vmovdqa $t1,0x10(%rsp)
1377 vpaddq 0x20($Tbl),@X[5],$t1
1378 vmovdqa $t2,0x20(%rsp)
1379 vpaddq 0x40($Tbl),@X[6],$t2
1380 vmovdqa $t3,0x30(%rsp)
1381 vpaddq 0x60($Tbl),@X[7],$t3
1382 vmovdqa $t0,0x40(%rsp)
1384 vmovdqa $t1,0x50(%rsp)
1386 vmovdqa $t2,0x60(%rsp)
1388 vmovdqa $t3,0x70(%rsp)
1394 add \$`16*2*$SZ`,$Tbl
# SHA-512 XOP schedule step: computes sigma0/sigma1 message expansion for
# X[0..1] with vprotq (XOP rotate) and interleaves the vector ops with the
# ~52 scalar round-body instructions supplied by &$body.
# NOTE(review): fragmentary paste — code kept byte-identical, comments only.
1396 sub XOP_512_00_47 () {
1400 my @insns = (&$body,&$body); # 52 instructions
1402 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
1403 eval(shift(@insns));
1404 eval(shift(@insns));
1405 &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
1406 eval(shift(@insns));
1407 eval(shift(@insns));
1408 &vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
1409 eval(shift(@insns));
1410 eval(shift(@insns));
1411 &vpsrlq ($t0,$t0,$sigma0[2]);
1412 eval(shift(@insns));
1413 eval(shift(@insns));
1414 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
1415 eval(shift(@insns));
1416 eval(shift(@insns));
1417 eval(shift(@insns));
1418 eval(shift(@insns));
1419 &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
1420 eval(shift(@insns));
1421 eval(shift(@insns));
1422 &vpxor ($t0,$t0,$t1);
1423 eval(shift(@insns));
1424 eval(shift(@insns));
1425 eval(shift(@insns));
1426 eval(shift(@insns));
1427 &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
1428 eval(shift(@insns));
1429 eval(shift(@insns));
1430 &vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
1431 eval(shift(@insns));
1432 eval(shift(@insns));
1433 &vpsrlq ($t2,@X[7],$sigma1[2]);
1434 eval(shift(@insns));
1435 eval(shift(@insns));
1436 &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
1437 eval(shift(@insns));
1438 eval(shift(@insns));
1439 &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
1440 eval(shift(@insns));
1441 eval(shift(@insns));
1442 &vpxor ($t3,$t3,$t2);
1443 eval(shift(@insns));
1444 eval(shift(@insns));
1445 eval(shift(@insns));
1446 eval(shift(@insns));
1447 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1448 eval(shift(@insns));
1449 eval(shift(@insns));
1450 eval(shift(@insns));
1451 eval(shift(@insns));
1452 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1453 eval(shift(@insns));
1454 eval(shift(@insns));
1455 eval(shift(@insns));
1456 eval(shift(@insns));
1457 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1458 foreach (@insns) { eval; } # remaining instructions
1459 &vmovdqa (16*$j."(%rsp)",$t2);
# Eight iterations for SHA-512 (two words per xmm register, 16 words total).
1462 for ($i=0,$j=0; $j<8; $j++) {
1463 &XOP_512_00_47($j,\&body_00_15,@X);
1464 push(@X,shift(@X)); # rotate(@X)
1466 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1467 &jne (".Lxop_00_47");
# Final 16 rounds without message expansion.
1469 for ($i=0; $i<16; ) {
1470 foreach(body_00_15()) { eval; }
1478 lea 16*$SZ($inp),$inp
# Win64 epilogue for the XOP path: restore callee-saved xmm6..9
# (and xmm10/11 for SHA-512) from the stack save area.
1503 $code.=<<___ if ($win64);
1504 movaps 16*$SZ+32(%rsp),%xmm6
1505 movaps 16*$SZ+48(%rsp),%xmm7
1506 movaps 16*$SZ+64(%rsp),%xmm8
1507 movaps 16*$SZ+80(%rsp),%xmm9
1509 $code.=<<___ if ($win64 && $SZ>4);
1510 movaps 16*$SZ+96(%rsp),%xmm10
1511 movaps 16*$SZ+112(%rsp),%xmm11
1527 .cfi_def_cfa_register %rsp
1531 .size ${func}_xop,.-${func}_xop
1534 ######################################################################
1535 # AVX+shrd code path
# In the AVX path ror is emitted as shrd (same result for rotate-right
# when both inputs are the same register).
1537 local *ror = sub { &shrd(@_[0],@_) };
1540 .type ${func}_avx,\@function,3
1545 mov %rsp,%rax # copy %rsp
1546 .cfi_def_cfa_register %rax
1559 shl \$4,%rdx # num*16
1560 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1561 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1562 and \$-64,%rsp # align stack frame
1563 mov $ctx,$_ctx # save ctx, 1st arg
1564 mov $inp,$_inp # save inp, 2nd arh
1565 mov %rdx,$_end # save end pointer, "3rd" arg
1566 mov %rax,$_rsp # save copy of %rsp
1567 .cfi_cfa_expression $_rsp,deref,+8
# Win64 prologue: preserve xmm6..9 (+10/11 for SHA-512) per the ABI.
1569 $code.=<<___ if ($win64);
1570 movaps %xmm6,16*$SZ+32(%rsp)
1571 movaps %xmm7,16*$SZ+48(%rsp)
1572 movaps %xmm8,16*$SZ+64(%rsp)
1573 movaps %xmm9,16*$SZ+80(%rsp)
1575 $code.=<<___ if ($win64 && $SZ>4);
1576 movaps %xmm10,16*$SZ+96(%rsp)
1577 movaps %xmm11,16*$SZ+112(%rsp)
# SHA-256 AVX path: four xmm registers for X[0..15], $t4/$t5 hold the
# shuffle masks used by the sigma1 computation in Xupdate_256_AVX.
1592 if ($SZ==4) { # SHA256
1593 my @X = map("%xmm$_",(0..3));
1594 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1597 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1598 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1602 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1603 vmovdqu 0x00($inp),@X[0]
1604 vmovdqu 0x10($inp),@X[1]
1605 vmovdqu 0x20($inp),@X[2]
1606 vmovdqu 0x30($inp),@X[3]
1607 vpshufb $t3,@X[0],@X[0]
1608 lea $TABLE(%rip),$Tbl
1609 vpshufb $t3,@X[1],@X[1]
1610 vpshufb $t3,@X[2],@X[2]
1611 vpaddd 0x00($Tbl),@X[0],$t0
1612 vpshufb $t3,@X[3],@X[3]
1613 vpaddd 0x20($Tbl),@X[1],$t1
1614 vpaddd 0x40($Tbl),@X[2],$t2
1615 vpaddd 0x60($Tbl),@X[3],$t3
1616 vmovdqa $t0,0x00(%rsp)
1618 vmovdqa $t1,0x10(%rsp)
1620 vmovdqa $t2,0x20(%rsp)
1622 vmovdqa $t3,0x30(%rsp)
1628 sub \$`-16*2*$SZ`,$Tbl # size optimization
# Returns the list of instruction strings implementing one SHA-256
# message-schedule update (sigma0 on X[1..4], sigma1 on X[14..15] then
# X[16..17]) for the AVX path; callers interleave them with round bodies.
# NOTE(review): fragmentary paste — code kept byte-identical, comments only.
1630 sub Xupdate_256_AVX () {
1632 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
1633 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
1634 '&vpsrld ($t2,$t0,$sigma0[0]);',
1635 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
1636 '&vpsrld ($t3,$t0,$sigma0[2])',
1637 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
1638 '&vpxor ($t0,$t3,$t2)',
1639 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
1640 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1641 '&vpxor ($t0,$t0,$t1)',
1642 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1643 '&vpxor ($t0,$t0,$t2)',
1644 '&vpsrld ($t2,$t3,$sigma1[2]);',
1645 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
1646 '&vpsrlq ($t3,$t3,$sigma1[0]);',
1647 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
1648 '&vpxor ($t2,$t2,$t3);',
1649 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1650 '&vpxor ($t2,$t2,$t3)',
1651 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
1652 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
1653 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
1654 '&vpsrld ($t2,$t3,$sigma1[2])',
1655 '&vpsrlq ($t3,$t3,$sigma1[0])',
1656 '&vpxor ($t2,$t2,$t3);',
1657 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1658 '&vpxor ($t2,$t2,$t3)',
1659 '&vpshufb ($t2,$t2,$t5)',
1660 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
# Interleave the ~29 Xupdate instructions with four round bodies
# (~104 scalar instructions), then add the round constants and spill
# the pre-added schedule words to the stack slot for round $j.
1664 sub AVX_256_00_47 () {
1668 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1670 foreach (Xupdate_256_AVX()) { # 29 instructions
1672 eval(shift(@insns));
1673 eval(shift(@insns));
1674 eval(shift(@insns));
1676 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1677 foreach (@insns) { eval; } # remaining instructions
1678 &vmovdqa (16*$j."(%rsp)",$t2);
# Driver: four schedule iterations, then 16 plain rounds.
1681 for ($i=0,$j=0; $j<4; $j++) {
1682 &AVX_256_00_47($j,\&body_00_15,@X);
1683 push(@X,shift(@X)); # rotate(@X)
1685 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1686 &jne (".Lavx_00_47");
1688 for ($i=0; $i<16; ) {
1689 foreach(body_00_15()) { eval; }
# SHA-512 AVX path: load and byte-swap the 128-byte block into xmm0..7,
# pre-add round constants and spill to the stack frame.
# NOTE(review): fragmentary paste — code kept byte-identical, comments only.
1693 my @X = map("%xmm$_",(0..7));
1694 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1700 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1701 vmovdqu 0x00($inp),@X[0]
1702 lea $TABLE+0x80(%rip),$Tbl # size optimization
1703 vmovdqu 0x10($inp),@X[1]
1704 vmovdqu 0x20($inp),@X[2]
1705 vpshufb $t3,@X[0],@X[0]
1706 vmovdqu 0x30($inp),@X[3]
1707 vpshufb $t3,@X[1],@X[1]
1708 vmovdqu 0x40($inp),@X[4]
1709 vpshufb $t3,@X[2],@X[2]
1710 vmovdqu 0x50($inp),@X[5]
1711 vpshufb $t3,@X[3],@X[3]
1712 vmovdqu 0x60($inp),@X[6]
1713 vpshufb $t3,@X[4],@X[4]
1714 vmovdqu 0x70($inp),@X[7]
1715 vpshufb $t3,@X[5],@X[5]
1716 vpaddq -0x80($Tbl),@X[0],$t0
1717 vpshufb $t3,@X[6],@X[6]
1718 vpaddq -0x60($Tbl),@X[1],$t1
1719 vpshufb $t3,@X[7],@X[7]
1720 vpaddq -0x40($Tbl),@X[2],$t2
1721 vpaddq -0x20($Tbl),@X[3],$t3
1722 vmovdqa $t0,0x00(%rsp)
1723 vpaddq 0x00($Tbl),@X[4],$t0
1724 vmovdqa $t1,0x10(%rsp)
1725 vpaddq 0x20($Tbl),@X[5],$t1
1726 vmovdqa $t2,0x20(%rsp)
1727 vpaddq 0x40($Tbl),@X[6],$t2
1728 vmovdqa $t3,0x30(%rsp)
1729 vpaddq 0x60($Tbl),@X[7],$t3
1730 vmovdqa $t0,0x40(%rsp)
1732 vmovdqa $t1,0x50(%rsp)
1734 vmovdqa $t2,0x60(%rsp)
1736 vmovdqa $t3,0x70(%rsp)
1742 add \$`16*2*$SZ`,$Tbl
# Returns the list of instruction strings for one SHA-512 schedule update
# (sigma0 on X[1..2], sigma1 on X[14..15]) in the AVX path — rotates are
# synthesized from shift/shift/xor since AVX lacks a 64-bit vector rotate.
# NOTE(review): fragmentary paste — code kept byte-identical, comments only.
1744 sub Xupdate_512_AVX () {
1746 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
1747 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
1748 '&vpsrlq ($t2,$t0,$sigma0[0])',
1749 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
1750 '&vpsrlq ($t3,$t0,$sigma0[2])',
1751 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
1752 '&vpxor ($t0,$t3,$t2)',
1753 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1754 '&vpxor ($t0,$t0,$t1)',
1755 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1756 '&vpxor ($t0,$t0,$t2)',
1757 '&vpsrlq ($t3,@X[7],$sigma1[2]);',
1758 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
1759 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
1760 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
1761 '&vpsrlq ($t1,@X[7],$sigma1[0]);',
1762 '&vpxor ($t3,$t3,$t2)',
1763 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
1764 '&vpxor ($t3,$t3,$t1)',
1765 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
1766 '&vpxor ($t3,$t3,$t2)',
1767 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
1768 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
# Interleave the ~23 Xupdate instructions with two round bodies,
# add round constants, spill to stack slot $j.
1772 sub AVX_512_00_47 () {
1776 my @insns = (&$body,&$body); # 52 instructions
1778 foreach (Xupdate_512_AVX()) { # 23 instructions
1780 eval(shift(@insns));
1781 eval(shift(@insns));
1783 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1784 foreach (@insns) { eval; } # remaining instructions
1785 &vmovdqa (16*$j."(%rsp)",$t2);
# Driver: eight schedule iterations for SHA-512, then 16 plain rounds.
1788 for ($i=0,$j=0; $j<8; $j++) {
1789 &AVX_512_00_47($j,\&body_00_15,@X);
1790 push(@X,shift(@X)); # rotate(@X)
1792 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1793 &jne (".Lavx_00_47");
1795 for ($i=0; $i<16; ) {
1796 foreach(body_00_15()) { eval; }
1829 $code.=<<___ if ($win64);
1830 movaps 16*$SZ+32(%rsp),%xmm6
1831 movaps 16*$SZ+48(%rsp),%xmm7
1832 movaps 16*$SZ+64(%rsp),%xmm8
1833 movaps 16*$SZ+80(%rsp),%xmm9
1835 $code.=<<___ if ($win64 && $SZ>4);
1836 movaps 16*$SZ+96(%rsp),%xmm10
1837 movaps 16*$SZ+112(%rsp),%xmm11
1853 .cfi_def_cfa_register %rsp
1857 .size ${func}_avx,.-${func}_avx
1861 ######################################################################
1862 # AVX2+BMI code path
1864 my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
# BMI2-based scalar round body for the AVX2 path: uses rorx (non-flag
# rotate) and andn for Ch(), with Sigma0(a) of the previous round folded
# in at the top ("from the past") to improve scheduling.
# NOTE(review): fragmentary paste — code kept byte-identical, comments only.
1868 sub bodyx_00_15 () {
1869 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
1871 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1873 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
1874 '&and ($a4,$e)', # f&e
1875 '&rorx ($a0,$e,$Sigma1[2])',
1876 '&rorx ($a2,$e,$Sigma1[1])',
1878 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
1879 '&lea ($h,"($h,$a4)")',
1880 '&andn ($a4,$e,$g)', # ~e&g
1883 '&rorx ($a1,$e,$Sigma1[0])',
1884 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
1885 '&xor ($a0,$a1)', # Sigma1(e)
1888 '&rorx ($a4,$a,$Sigma0[2])',
1889 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
1890 '&xor ($a2,$b)', # a^b, b^c in next round
1891 '&rorx ($a1,$a,$Sigma0[1])',
1893 '&rorx ($a0,$a,$Sigma0[0])',
1894 '&lea ($d,"($d,$h)")', # d+=h
1895 '&and ($a3,$a2)', # (b^c)&(a^b)
1898 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
1899 '&xor ($a1,$a0)', # Sigma0(a)
1900 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
1901 '&mov ($a4,$e)', # copy of f in future
# Rotate the register assignment for the next round and bump $i.
1903 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1905 # and at the finish one has to $a+=$a1
1909 .type ${func}_avx2,\@function,3
1914 mov %rsp,%rax # copy %rsp
1915 .cfi_def_cfa_register %rax
1928 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1929 shl \$4,%rdx # num*16
1930 and \$-256*$SZ,%rsp # align stack frame
1931 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1932 add \$`2*$SZ*($rounds-8)`,%rsp
1933 mov $ctx,$_ctx # save ctx, 1st arg
1934 mov $inp,$_inp # save inp, 2nd arh
1935 mov %rdx,$_end # save end pointer, "3rd" arg
1936 mov %rax,$_rsp # save copy of %rsp
1937 .cfi_cfa_expression $_rsp,deref,+8
# Win64 prologue for the AVX2 path: preserve xmm6..9 (+10/11 for SHA-512).
1939 $code.=<<___ if ($win64);
1940 movaps %xmm6,16*$SZ+32(%rsp)
1941 movaps %xmm7,16*$SZ+48(%rsp)
1942 movaps %xmm8,16*$SZ+64(%rsp)
1943 movaps %xmm9,16*$SZ+80(%rsp)
1945 $code.=<<___ if ($win64 && $SZ>4);
1946 movaps %xmm10,16*$SZ+96(%rsp)
1947 movaps %xmm11,16*$SZ+112(%rsp)
1953 sub \$-16*$SZ,$inp # inp++, size optimization
1955 mov $inp,%r12 # borrow $T1
1957 cmp %rdx,$inp # $_end
1959 cmove %rsp,%r12 # next block or random data
# SHA-256 AVX2 path: ymm registers process two blocks at once; the second
# block (or stale stack data on the last iteration) comes via %r12.
1966 if ($SZ==4) { # SHA256
1967 my @X = map("%ymm$_",(0..3));
1968 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1971 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1972 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1976 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1977 vmovdqu -16*$SZ+0($inp),%xmm0
1978 vmovdqu -16*$SZ+16($inp),%xmm1
1979 vmovdqu -16*$SZ+32($inp),%xmm2
1980 vmovdqu -16*$SZ+48($inp),%xmm3
1981 #mov $inp,$_inp # offload $inp
1982 vinserti128 \$1,(%r12),@X[0],@X[0]
1983 vinserti128 \$1,16(%r12),@X[1],@X[1]
1984 vpshufb $t3,@X[0],@X[0]
1985 vinserti128 \$1,32(%r12),@X[2],@X[2]
1986 vpshufb $t3,@X[1],@X[1]
1987 vinserti128 \$1,48(%r12),@X[3],@X[3]
1989 lea $TABLE(%rip),$Tbl
1990 vpshufb $t3,@X[2],@X[2]
1991 vpaddd 0x00($Tbl),@X[0],$t0
1992 vpshufb $t3,@X[3],@X[3]
1993 vpaddd 0x20($Tbl),@X[1],$t1
1994 vpaddd 0x40($Tbl),@X[2],$t2
1995 vpaddd 0x60($Tbl),@X[3],$t3
1996 vmovdqa $t0,0x00(%rsp)
1998 vmovdqa $t1,0x20(%rsp)
# Non-Win64 builds track the moving stack with a secondary frame pointer
# kept in the red zone at -8(%rsp); see the emitted comments below.
2000 $code.=<<___ if (!$win64);
2001 # temporarily use %rdi as frame pointer
2006 lea -$PUSH8(%rsp),%rsp
2008 $code.=<<___ if (!$win64);
2009 # the frame info is at $_rsp, but the stack is moving...
2010 # so a second frame pointer is saved at -8(%rsp)
2011 # that is in the red zone
2013 .cfi_cfa_expression %rsp-8,deref,+8
2017 vmovdqa $t2,0x00(%rsp)
2019 vmovdqa $t3,0x20(%rsp)
2021 sub \$-16*2*$SZ,$Tbl # size optimization
# AVX2 SHA-256 schedule step: like AVX_256_00_47 but the frame moves down
# by $PUSH8 each iteration, so CFI expressions are re-emitted around the
# stack adjustment on non-Win64 builds.
# NOTE(review): fragmentary paste — code kept byte-identical, comments only.
2028 sub AVX2_256_00_47 () {
2032 my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
2033 my $base = "+2*$PUSH8(%rsp)";
2036 &lea ("%rsp","-$PUSH8(%rsp)");
2037 $code.=<<___ if (!$win64);
2038 .cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8
2039 # copy secondary frame pointer to new location again at -8(%rsp)
2040 pushq $PUSH8-8(%rsp)
2041 .cfi_cfa_expression %rsp,deref,+8
2043 .cfi_cfa_expression %rsp-8,deref,+8
2047 foreach (Xupdate_256_AVX()) { # 29 instructions
2049 eval(shift(@insns));
2050 eval(shift(@insns));
2051 eval(shift(@insns));
2053 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
2054 foreach (@insns) { eval; } # remaining instructions
2055 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
# Driver: four schedule iterations; $Tbl advances instead of an index.
2058 for ($i=0,$j=0; $j<4; $j++) {
2059 &AVX2_256_00_47($j,\&bodyx_00_15,@X);
2060 push(@X,shift(@X)); # rotate(@X)
2062 &lea ($Tbl,16*2*$SZ."($Tbl)");
2063 &cmpb (($SZ-1)."($Tbl)",0);
2064 &jne (".Lavx2_00_47");
# Final 16 rounds; $base selects which half-frame holds the X words.
2066 for ($i=0; $i<16; ) {
2067 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2068 foreach(bodyx_00_15()) { eval; }
# SHA-512 AVX2 path: eight ymm registers, two 128-byte blocks in flight;
# second block lanes are inserted from %r12 via vinserti128.
# NOTE(review): fragmentary paste — code kept byte-identical, comments only.
2071 my @X = map("%ymm$_",(0..7));
2072 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
2078 vmovdqu -16*$SZ($inp),%xmm0
2079 vmovdqu -16*$SZ+16($inp),%xmm1
2080 vmovdqu -16*$SZ+32($inp),%xmm2
2081 lea $TABLE+0x80(%rip),$Tbl # size optimization
2082 vmovdqu -16*$SZ+48($inp),%xmm3
2083 vmovdqu -16*$SZ+64($inp),%xmm4
2084 vmovdqu -16*$SZ+80($inp),%xmm5
2085 vmovdqu -16*$SZ+96($inp),%xmm6
2086 vmovdqu -16*$SZ+112($inp),%xmm7
2087 #mov $inp,$_inp # offload $inp
2088 vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2
2089 vinserti128 \$1,(%r12),@X[0],@X[0]
2090 vinserti128 \$1,16(%r12),@X[1],@X[1]
2091 vpshufb $t2,@X[0],@X[0]
2092 vinserti128 \$1,32(%r12),@X[2],@X[2]
2093 vpshufb $t2,@X[1],@X[1]
2094 vinserti128 \$1,48(%r12),@X[3],@X[3]
2095 vpshufb $t2,@X[2],@X[2]
2096 vinserti128 \$1,64(%r12),@X[4],@X[4]
2097 vpshufb $t2,@X[3],@X[3]
2098 vinserti128 \$1,80(%r12),@X[5],@X[5]
2099 vpshufb $t2,@X[4],@X[4]
2100 vinserti128 \$1,96(%r12),@X[6],@X[6]
2101 vpshufb $t2,@X[5],@X[5]
2102 vinserti128 \$1,112(%r12),@X[7],@X[7]
2104 vpaddq -0x80($Tbl),@X[0],$t0
2105 vpshufb $t2,@X[6],@X[6]
2106 vpaddq -0x60($Tbl),@X[1],$t1
2107 vpshufb $t2,@X[7],@X[7]
2108 vpaddq -0x40($Tbl),@X[2],$t2
2109 vpaddq -0x20($Tbl),@X[3],$t3
2110 vmovdqa $t0,0x00(%rsp)
2111 vpaddq 0x00($Tbl),@X[4],$t0
2112 vmovdqa $t1,0x20(%rsp)
2113 vpaddq 0x20($Tbl),@X[5],$t1
2114 vmovdqa $t2,0x40(%rsp)
2115 vpaddq 0x40($Tbl),@X[6],$t2
2116 vmovdqa $t3,0x60(%rsp)
# Same red-zone secondary frame pointer dance as the SHA-256 AVX2 setup.
2118 $code.=<<___ if (!$win64);
2119 # temporarily use %rdi as frame pointer
2124 lea -$PUSH8(%rsp),%rsp
2126 $code.=<<___ if (!$win64);
2127 # the frame info is at $_rsp, but the stack is moving...
2128 # so a second frame pointer is saved at -8(%rsp)
2129 # that is in the red zone
2131 .cfi_cfa_expression %rsp-8,deref,+8
2134 vpaddq 0x60($Tbl),@X[7],$t3
2135 vmovdqa $t0,0x00(%rsp)
2137 vmovdqa $t1,0x20(%rsp)
2139 vmovdqa $t2,0x40(%rsp)
2141 vmovdqa $t3,0x60(%rsp)
# AVX2 SHA-512 schedule step: two round bodies interleaved with the
# 23-instruction Xupdate; frame moves down by $PUSH8 per iteration.
# NOTE(review): fragmentary paste — code kept byte-identical, comments only.
2150 sub AVX2_512_00_47 () {
2154 my @insns = (&$body,&$body); # 48 instructions
2155 my $base = "+2*$PUSH8(%rsp)";
2158 &lea ("%rsp","-$PUSH8(%rsp)");
2159 $code.=<<___ if (!$win64);
2160 .cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8
2161 # copy secondary frame pointer to new location again at -8(%rsp)
2162 pushq $PUSH8-8(%rsp)
2163 .cfi_cfa_expression %rsp,deref,+8
2165 .cfi_cfa_expression %rsp-8,deref,+8
2169 foreach (Xupdate_512_AVX()) { # 23 instructions
2172 eval(shift(@insns));
2173 eval(shift(@insns));
2174 eval(shift(@insns));
2177 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
2178 foreach (@insns) { eval; } # remaining instructions
2179 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
# Driver: eight schedule iterations for SHA-512, then 16 plain rounds.
2182 for ($i=0,$j=0; $j<8; $j++) {
2183 &AVX2_512_00_47($j,\&bodyx_00_15,@X);
2184 push(@X,shift(@X)); # rotate(@X)
2186 &lea ($Tbl,16*2*$SZ."($Tbl)");
2187 &cmpb (($SZ-1-0x80)."($Tbl)",0);
2188 &jne (".Lavx2_00_47");
2190 for ($i=0; $i<16; ) {
2191 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
2192 foreach(bodyx_00_15()) { eval; }
2196 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
2198 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
2199 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
2219 cmp `$PUSH8+2*8`($Tbl),$inp # $_end
# Second-lane rounds for the already-expanded second block (8 at a time,
# walking $Tbl back up through the spilled schedule).
2230 for ($i=0; $i<8; ) {
2231 my $base="+16($Tbl)";
2232 foreach(bodyx_00_15()) { eval; }
2235 lea -$PUSH8($Tbl),$Tbl
2239 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
2241 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
2242 lea `2*$SZ*($rounds-8)`(%rsp),%rsp
2243 # restore frame pointer to original location at $_rsp
2244 .cfi_cfa_expression $_rsp,deref,+8
2252 lea `2*16*$SZ`($inp),$inp # inp+=2
2259 cmove %rsp,%r12 # next block or stale data
2270 # temporarily use $Tbl as index to $_rsp
2271 # this avoids the need to save a secondary frame pointer at -8(%rsp)
2272 .cfi_cfa_expression $Tbl+`16*$SZ+3*8`,deref,+8
2275 mov `16*$SZ+3*8`($Tbl),%rsi
# Win64 epilogue for the AVX2 path: restore preserved xmm registers,
# addressed via $Tbl since %rsp is still the moved frame.
2279 $code.=<<___ if ($win64);
2280 movaps 16*$SZ+32($Tbl),%xmm6
2281 movaps 16*$SZ+48($Tbl),%xmm7
2282 movaps 16*$SZ+64($Tbl),%xmm8
2283 movaps 16*$SZ+80($Tbl),%xmm9
2285 $code.=<<___ if ($win64 && $SZ>4);
2286 movaps 16*$SZ+96($Tbl),%xmm10
2287 movaps 16*$SZ+112($Tbl),%xmm11
2303 .cfi_def_cfa_register %rsp
2307 .size ${func}_avx2,.-${func}_avx2
# Win64 structured-exception handler: recovers the original %rsp (via the
# $_rsp slot), restores non-volatile GPRs and xmm registers from the frame
# into the CONTEXT record, then chains to RtlVirtualUnwind.
# NOTE(review): fragmentary paste — code kept byte-identical, comments only.
2312 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2313 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
2321 .extern __imp_RtlVirtualUnwind
2322 .type se_handler,\@abi-omnipotent
2336 mov 120($context),%rax # pull context->Rax
2337 mov 248($context),%rbx # pull context->Rip
2339 mov 8($disp),%rsi # disp->ImageBase
2340 mov 56($disp),%r11 # disp->HanderlData
2342 mov 0(%r11),%r10d # HandlerData[0]
2343 lea (%rsi,%r10),%r10 # prologue label
2344 cmp %r10,%rbx # context->Rip<prologue label
2347 mov 152($context),%rax # pull context->Rsp
2349 mov 4(%r11),%r10d # HandlerData[1]
2350 lea (%rsi,%r10),%r10 # epilogue label
2351 cmp %r10,%rbx # context->Rip>=epilogue label
2354 $code.=<<___ if ($avx>1);
2355 lea .Lavx2_shortcut(%rip),%r10
2356 cmp %r10,%rbx # context->Rip<avx2_shortcut
2360 add \$`2*$SZ*($rounds-8)`,%rax
2364 mov %rax,%rsi # put aside Rsp
2365 mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
2373 mov %rbx,144($context) # restore context->Rbx
2374 mov %rbp,160($context) # restore context->Rbp
2375 mov %r12,216($context) # restore context->R12
2376 mov %r13,224($context) # restore context->R13
2377 mov %r14,232($context) # restore context->R14
2378 mov %r15,240($context) # restore context->R15
2380 lea .Lepilogue(%rip),%r10
2382 jb .Lin_prologue # non-AVX code
2384 lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area
2385 lea 512($context),%rdi # &context.Xmm6
2386 mov \$`$SZ==4?8:12`,%ecx
2387 .long 0xa548f3fc # cld; rep movsq
2392 mov %rax,152($context) # restore context->Rsp
2393 mov %rsi,168($context) # restore context->Rsi
2394 mov %rdi,176($context) # restore context->Rdi
2396 mov 40($disp),%rdi # disp->ContextRecord
2397 mov $context,%rsi # context
2398 mov \$154,%ecx # sizeof(CONTEXT)
2399 .long 0xa548f3fc # cld; rep movsq
2402 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2403 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2404 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2405 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2406 mov 40(%rsi),%r10 # disp->ContextRecord
2407 lea 56(%rsi),%r11 # &disp->HandlerData
2408 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2409 mov %r10,32(%rsp) # arg5
2410 mov %r11,40(%rsp) # arg6
2411 mov %r12,48(%rsp) # arg7
2412 mov %rcx,56(%rsp) # arg8, (NULL)
2413 call *__imp_RtlVirtualUnwind(%rip)
2415 mov \$1,%eax # ExceptionContinueSearch
2427 .size se_handler,.-se_handler
# Minimal SEH handler for the SHA extensions path (SHA-256 only): it just
# restores the five saved xmm registers, since that path keeps a simple frame.
2430 $code.=<<___ if ($SZ==4 && $shaext);
2431 .type shaext_handler,\@abi-omnipotent
2445 mov 120($context),%rax # pull context->Rax
2446 mov 248($context),%rbx # pull context->Rip
2448 lea .Lprologue_shaext(%rip),%r10
2449 cmp %r10,%rbx # context->Rip<.Lprologue
2452 lea .Lepilogue_shaext(%rip),%r10
2453 cmp %r10,%rbx # context->Rip>=.Lepilogue
2456 lea -8-5*16(%rax),%rsi
2457 lea 512($context),%rdi # &context.Xmm6
2459 .long 0xa548f3fc # cld; rep movsq
2462 .size shaext_handler,.-shaext_handler
.rva .LSEH_begin_$func
.rva .LSEH_end_$func
.rva .LSEH_info_$func
# SEH pdata/xdata: one begin/end/info triple per emitted function variant,
# gated on the same feature flags that gate the code paths above.
2472 $code.=<<___ if ($SZ==4 && $shaext);
2473 .rva .LSEH_begin_${func}_shaext
2474 .rva .LSEH_end_${func}_shaext
2475 .rva .LSEH_info_${func}_shaext
2477 $code.=<<___ if ($SZ==4);
2478 .rva .LSEH_begin_${func}_ssse3
2479 .rva .LSEH_end_${func}_ssse3
2480 .rva .LSEH_info_${func}_ssse3
2482 $code.=<<___ if ($avx && $SZ==8);
2483 .rva .LSEH_begin_${func}_xop
2484 .rva .LSEH_end_${func}_xop
2485 .rva .LSEH_info_${func}_xop
2487 $code.=<<___ if ($avx);
2488 .rva .LSEH_begin_${func}_avx
2489 .rva .LSEH_end_${func}_avx
2490 .rva .LSEH_info_${func}_avx
2492 $code.=<<___ if ($avx>1);
2493 .rva .LSEH_begin_${func}_avx2
2494 .rva .LSEH_end_${func}_avx2
2495 .rva .LSEH_info_${func}_avx2
2503 .rva .Lprologue,.Lepilogue # HandlerData[]
# xdata records: each names its handler and prologue/epilogue labels.
2505 $code.=<<___ if ($SZ==4 && $shaext);
2506 .LSEH_info_${func}_shaext:
2510 $code.=<<___ if ($SZ==4);
2511 .LSEH_info_${func}_ssse3:
2514 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
2516 $code.=<<___ if ($avx && $SZ==8);
2517 .LSEH_info_${func}_xop:
2520 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
2522 $code.=<<___ if ($avx);
2523 .LSEH_info_${func}_avx:
2526 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
2528 $code.=<<___ if ($avx>1);
2529 .LSEH_info_${func}_avx2:
2532 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
2539 "sha256rnds2" => 0xcb,
2540 "sha256msg1" => 0xcc,
2541 "sha256msg2" => 0xcd );
# Assemble SHA-NI instructions by hand (0f 38 /opcode + ModR/M) for
# assemblers that predate the mnemonics; otherwise pass through verbatim.
2543 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
2544 my @opcode=(0x0f,0x38);
2545 push @opcode,$opcodelet{$instr};
2546 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
2547 return ".byte\t".join(',',@opcode);
2549 return $instr."\t".@_[0];
# Final emission: expand `...` arithmetic, rewrite sha256* mnemonics,
# and print; fail hard if STDOUT cannot be flushed.
2553 foreach (split("\n",$code)) {
2554 s/\`([^\`]*)\`/eval $1/geo;
2556 s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
2560 close STDOUT or die "error closing STDOUT: $!";