# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 - >40%. No magical
# tricks, just straight implementation... I really wonder why gcc
# [being armed with inline assembler] fails to generate as fast code.
# The only thing which is cool about this module is that the very
# same instruction sequence is used for both SHA-256 and SHA-512. In
# the former case the instructions operate on 32-bit operands, while
# in the latter - on 64-bit ones. All I had to do was get one flavor
# right; the other one passed the test right away:-)
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank[!], approaches 4 instructions per CPU
# clock cycle and runs in 1003 cycles, then 1275 is a very good result
# for the 3-way issue Opteron pipeline with X[16] maintained in memory.
# So *if* there is a way to improve it, *then* the only way would be to
# try to offload X[16] updates to the SSE unit, but that would require
# a "deeper" loop unroll, which in turn would naturally cause size
# blow-up, not to mention increased complexity! And once again, only
# *if* it's actually possible to noticeably improve the overall
# instruction-level parallelism (ILP) on a given CPU implementation.
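#
# [Equivalently: ~15.7 cycles per processed byte for SHA-256 and ~10.0
#  for SHA-512 on Opteron, which is the metric used in the performance
#  table further below.]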
#
# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
# [currently available] EM64T CPUs apparently are far from it. On the
# contrary, the 64-bit version, sha512_block, is ~30% *slower* than the
# 32-bit sha256_block:-( This is presumably because 64-bit shifts/rotates
# are apparently not atomic [single-uop] instructions there, but are
# implemented in microcode.
#
# Optimization including one of Pavel Semjanov's ideas, alternative
# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
# unfortunately -2% SHA512 on P4 [which nobody should care about].
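#
# [The identity behind the alternative Maj is Maj(a,b,c)==Ch(a^b,c,b).
#  The helper below is only an illustrative plain-Perl sanity check of
#  that identity; it is not used by the code generator and its name is
#  made up for this comment.]
sub _check_alt_maj {
	for my $v (0..7) {
		my ($a,$b,$c) = (($v>>2)&1,($v>>1)&1,$v&1);
		my $maj = ($a & $b) ^ ($a & $c) ^ ($b & $c);	# textbook Maj
		my $alt = ((($a ^ $b) & $c) ^ ((~($a ^ $b)) & $b)) & 1;	# Ch(a^b,c,b)
		return 0 if ($maj != $alt);
	}
	return 1;				# identity holds for all bit values
}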
#
# Add SIMD code paths, see below for improvement coefficients. SSSE3
# code path was not attempted for SHA512, because the estimated
# improvement, noticeably less than 9%, is not high enough to justify
# the effort, at least not on pre-AVX processors. [The obvious
# exception is VIA Nano, but it has a SHA512 instruction that is
# faster and should be used instead.] For reference, the corresponding
# estimated upper limit for improvement for SSSE3 SHA256 is 28%. The
# fact that higher coefficients are observed on VIA Nano and Bulldozer
# has more to do with specifics of their architecture [which is a
# topic for separate discussion].
#
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant ones. The data is then processed with the same SIMD
# instruction sequence as for AVX, but with %ymm as operands. A side
# effect is an increased stack frame, 448 additional bytes in SHA256
# and 1152 in SHA512.
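#
# [Illustration: after the vinserti128 loads in the AVX2 prologue below,
#  each %ymm register holds 16 bytes of block n in its low 128-bit half
#  and the corresponding 16 bytes of block n+1 in its high half, so one
#  256-bit operation advances the message schedule of both blocks.]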
######################################################################
# Current performance in cycles per processed byte (less is better):
#
#               SHA256  SSSE3       AVX/XOP(*)      SHA512  AVX/XOP(*)
#
# AMD K8        14.9    -           -               9.57    -
# Core 2        15.6    13.8(+13%)  -               9.97    -
# Westmere      14.8    12.3(+19%)  -               9.58    -
# Sandy Bridge  17.4    14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge    12.6    10.5(+20%)  10.3(+22%)      8.17    7.22(+13%)
# Haswell       12.2    9.28(+31%)  7.80(+56%)      7.66    5.40(+42%)
# Bulldozer     21.1    13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
# VIA Nano      23.0    16.5(+39%)  -               14.7    -
# Atom          23.0    18.9(+22%)  -               14.7    -
#
# (*)	whichever best applicable;
# (**)	the switch from ror to shrd accounts for a fair share of the
#	improvement;
# (***)	execution time is fully determined by the remaining integer-only
#	part, body_00_15; reducing the amount of SIMD instructions below
#	a certain limit makes no difference/sense; to conserve space the
#	SHA256 XOP code path is therefore omitted;
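
# For reference, plain-Perl sketches of the SHA-256 logical functions the
# assembly below implements (FIPS 180-4 notation; SHA-512 uses the same
# formulas on 64-bit words with different rotation amounts). These helpers
# are illustrative only - they are not used by the code generator and
# their names are invented for this comment.
sub _rotr32	{ my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff; }
sub _Ch		{ my ($e,$f,$g)=@_; (($e & $f) ^ (~$e & $g)) & 0xffffffff; }
sub _Maj	{ my ($a,$b,$c)=@_; (($a & $b) ^ ($a & $c) ^ ($b & $c)) & 0xffffffff; }
sub _Sigma0	{ my ($a)=@_; _rotr32($a,2)  ^ _rotr32($a,13) ^ _rotr32($a,22); }
sub _Sigma1	{ my ($e)=@_; _rotr32($e,6)  ^ _rotr32($e,11) ^ _rotr32($e,25); }
sub _sigma0	{ my ($x)=@_; _rotr32($x,7)  ^ _rotr32($x,18) ^ ($x>>3);  }
sub _sigma1	{ my ($x)=@_; _rotr32($x,17) ^ _rotr32($x,19) ^ ($x>>10); }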
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
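	# ($avx ends up 0, 1 or 2, selecting none, AVX or AVX2/BMI code
	# paths below, depending on the assembler capabilities probed here.)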
open OUT,"| \"$^X\" $xlate $flavour $output";

if ($output =~ /512/) {
	$func="sha512_block_data_order";
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
					"%r8", "%r9", "%r10","%r11");
	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");

	$func="sha256_block_data_order";
	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
					"%r8d","%r9d","%r10d","%r11d");
	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");

$ctx="%rdi";	# 1st arg, zapped by $a3
$inp="%rsi";	# 2nd arg

$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_rsp="16*$SZ+3*8(%rsp)";
$framesz="16*$SZ+4*8";
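
# Stack frame layout used by the scalar code path (offsets from %rsp):
#	0..16*$SZ-1	X[0..15] message schedule scratch (circular buffer)
#	16*$SZ+0	saved ctx pointer (1st arg)
#	16*$SZ+8	saved inp pointer (2nd arg)
#	16*$SZ+16	end-of-input pointer
#	16*$SZ+24	caller's %rsp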
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
	mov	$T1,`$SZ*($i&0xf)`(%rsp)
	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
	add	$a2,$T1			# T1+=Ch(e,f,g)
	add	($Tbl),$T1		# T1+=K[round]
	xor	$b,$a2			# a^b, b^c in next round
	ror	\$$Sigma1[0],$a0	# Sigma1(e)
	ror	\$$Sigma0[0],$a1	# Sigma0(a)
	add	$a0,$T1			# T1+=Sigma1(e)
	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	lea	$STRIDE($Tbl),$Tbl	# round++
$code.=<<___ if ($i<15);
	add	$a1,$h			# h+=Sigma0(a)
    ($a2,$a3) = ($a3,$a2);
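# For reference, each round computes (FIPS 180-4 notation):
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	h  = T1 + Sigma0(a) + Maj(a,b,c),  d = d + T1
# with (a,b,c,d,e,f,g,h) rotating every round; the additions are
# rescheduled across adjacent rounds in the generated code (the
# "modulo-scheduled h+=Sigma0(a)" below).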
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
	mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a2
	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
	add	$a1,$a			# modulo-scheduled h+=Sigma0(a)
	ror	\$`$sigma1[1]-$sigma1[0]`,$a2
	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
	xor	$a1,$a2			# sigma1(X[(i+14)&0xf])
	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
	add	`$SZ*($i&0xf)`(%rsp),$T1
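	# message schedule recurrence implemented here (indices mod 16,
	# with X kept as a 16-entry circular buffer on the stack):
	#	W[i] = W[i-16] + sigma0(W[i-15]) + W[i-7] + sigma1(W[i-2])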
.extern	OPENSSL_ia32cap_P
.type	$func,\@function,3
$code.=<<___ if ($SZ==4 || $avx);
	lea	OPENSSL_ia32cap_P(%rip),%r11
$code.=<<___ if ($avx && $SZ==8);
	test	\$`1<<11`,%r10d		# check for XOP
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
$code.=<<___ if ($avx);
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	and	\$`1<<28|1<<9`,%r10d	# mask AVX and SSSE3 bits
	cmp	\$`1<<28|1<<9|1<<30`,%r10d
$code.=<<___ if ($SZ==4);
	mov	%rsp,%r11		# copy %rsp
	shl	\$4,%rdx		# num*16
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%r11,$_rsp		# save copy of %rsp
	lea	$TABLE(%rip),$Tbl
for($i=0;$i<16;$i++) {
	$code.="	mov	$SZ*$i($inp),$T1\n";
	$code.="	mov	@ROT[4],$a0\n";
	$code.="	mov	@ROT[0],$a1\n";
	$code.="	bswap	$T1\n";
	&ROUND_00_15($i,@ROT);
	unshift(@ROT,pop(@ROT));
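	# rotate register assignment: (a,b,c,d,e,f,g,h) -> (h,a,b,c,d,e,f,g),
	# so the same ROUND_00_15 body serves every round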
	&ROUND_16_XX($i,@ROT);
	unshift(@ROT,pop(@ROT));
	cmpb	\$0,`$SZ-1`($Tbl)
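	# every K[] entry has a non-zero top byte; the table is followed by
	# a byte-swap mask whose top byte is zero, which terminates the loop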
	add	$a1,$A			# modulo-scheduled h+=Sigma0(a)
	lea	16*$SZ($inp),$inp
.type	$TABLE,\@object
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
	.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
	.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
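	# the .long quadruples above that start with 0x00010203 form the
	# big-endian byte-swap mask; the remaining quadruples are dword
	# shuffle masks - all are pshufb constants for the SSSE3/AVX paths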
	.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.type	$TABLE,\@object
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x3956c25bf348b538,0x59f111f1b605d019
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0xd807aa98a3030242,0x12835b0145706fbe
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x06ca6351e003826f,0x142929670a0e6e70
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0x81c2c92e47edaee6,0x92722c851482353b
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xd192e819d6ef5218,0xd69906245565a910
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0x90befffa23631e28,0xa4506cebde82bde9
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xca273eceea26619c,0xd186b8c721c0c207
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x113f9804bef90dae,0x1b710b35131c471b
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x28db77f523047d84,0x32caab7b40c72493
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
	.quad	0x0001020304050607,0x08090a0b0c0d0e0f
	.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"

######################################################################
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
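    # e.g. &ror($a0,14) appends "ror	$14,%r13d" to $code (SHA-256 register
    # assignment shown; operands come out in AT&T order, immediates get "$")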
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g
	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&and	($a4,$e)',			# (f^g)&e
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($d,$h)',			# d+=h
	'&add	($h,$a3)',			# h+=Maj(a,b,c)
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
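	# body_00_15() returns one round as a list of code-emitting snippets;
	# the SIMD Xupdate_* paths below eval these interleaved with their
	# vector message-schedule instructions rather than back to back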
######################################################################

if ($SZ==4) {	# SHA256 only
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));

.type	${func}_ssse3,\@function,3
	mov	%rsp,%r11		# copy %rsp
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*4`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%r11,$_rsp		# save copy of %rsp
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
	#movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	#movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	movdqu	0x00($inp),@X[0]
	movdqu	0x10($inp),@X[1]
	movdqu	0x20($inp),@X[2]
	movdqu	0x30($inp),@X[3]
	lea	$TABLE(%rip),$Tbl
	movdqa	0x00($Tbl),$t0
	movdqa	0x20($Tbl),$t1
	movdqa	0x40($Tbl),$t2
	movdqa	0x60($Tbl),$t3
	movdqa	$t0,0x00(%rsp)
	movdqa	$t1,0x10(%rsp)
	movdqa	$t2,0x20(%rsp)
	movdqa	$t3,0x30(%rsp)
	sub	\$`-16*2*$SZ`,$Tbl	# size optimization

sub Xupdate_256_SSSE3 () {
	'&movdqa	($t0,@X[1]);',
	'&movdqa	($t3,@X[3])',
	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
	'&palignr	($t3,@X[2],$SZ);',	# X[9..12]
	'&movdqa	($t2,$t0);',
	'&psrld		($t0,$sigma0[2])',
	'&paddd		(@X[0],$t3);',		# X[0..3] += X[9..12]
	'&psrld		($t2,$sigma0[0])',
	'&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&pslld		($t1,8*$SZ-$sigma0[1]);'.
	'&psrld		($t2,$sigma0[1]-$sigma0[0]);'.
	'&pslld		($t1,$sigma0[1]-$sigma0[0]);'.
	'&pxor		($t0,$t1);',		# sigma0(X[1..4])
	'&psrld		($t3,$sigma1[2])',
	'&paddd		(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
	'&psrlq		($t2,$sigma1[0])',
	'&psrlq		($t2,$sigma1[1]-$sigma1[0])',
	'&pshufb	($t3,$t4)',		# sigma1(X[14..15])
	'&paddd		(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
	'&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&movdqa	($t2,$t3);',
	'&psrld		($t3,$sigma1[2])',
	'&psrlq		($t2,$sigma1[0])',
	'&psrlq		($t2,$sigma1[1]-$sigma1[0])',
	'&movdqa	($t2,16*2*$j."($Tbl)")',
	'&paddd		(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])

sub SSSE3_256_00_47 () {
    my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
	foreach (Xupdate_256_SSSE3()) {		# 36 instructions
    } else {			# squeeze extra 4% on Westmere and 19% on Atom
	  eval(shift(@insns));	#@
	  eval(shift(@insns));	#@
	  eval(shift(@insns));	#@
	&palignr	($t0,@X[0],$SZ);	# X[1..4]
	&palignr	($t3,@X[2],$SZ);	# X[9..12]
	  eval(shift(@insns));	#@
	  eval(shift(@insns));	#@
	&psrld		($t0,$sigma0[2]);
	&paddd		(@X[0],$t3);		# X[0..3] += X[9..12]
	  eval(shift(@insns));	#@
	&psrld		($t2,$sigma0[0]);
	&pshufd		($t3,@X[3],0b11111010);	# X[14..15]
	  eval(shift(@insns));	#@
	&pslld		($t1,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));	#@
	&psrld		($t2,$sigma0[1]-$sigma0[0]);
	&pslld		($t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));	#@
	&pxor		($t0,$t1);		# sigma0(X[1..4])
	  eval(shift(@insns));	#@
	&psrld		($t3,$sigma1[2]);
	&paddd		(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
	  eval(shift(@insns));	#@
	&psrlq		($t2,$sigma1[0]);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));	#@
	&psrlq		($t2,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));	#@
	#&pshufb	($t3,$t4);		# sigma1(X[14..15])
	&pshufd		($t3,$t3,0b10000000);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));	#@
	&paddd		(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
	&pshufd		($t3,@X[0],0b01010000);	# X[16..17]
	  eval(shift(@insns));	#@
	&psrld		($t3,$sigma1[2]);
	  eval(shift(@insns));	#@
	&psrlq		($t2,$sigma1[0]);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));	#@
	&psrlq		($t2,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));	#@
	&pshufd		($t3,$t3,0b00001000);
	&movdqa		($t2,16*2*$j."($Tbl)");
	  eval(shift(@insns));	#@
	&paddd		(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
	  eval(shift(@insns));	#@
    foreach (@insns) { eval; }		# remaining instructions
    &movdqa	(16*$j."(%rsp)",$t2);

    for ($i=0,$j=0; $j<4; $j++) {
	&SSSE3_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lssse3_00_47");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
	lea	16*$SZ($inp),$inp
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
.size	${func}_ssse3,.-${func}_ssse3

######################################################################

if ($SZ==8) {	# SHA512 only
.type	${func}_xop,\@function,3
	mov	%rsp,%r11		# copy %rsp
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%r11,$_rsp		# save copy of %rsp
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)

if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$`-16*2*$SZ`,$Tbl	# size optimization

sub XOP_256_00_47 () {
    my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrld		($t0,$t0,$sigma0[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t1);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t3,@X[3],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrld		($t2,@X[3],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrldq	($t3,$t3,8);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t3,@X[0],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrld		($t2,@X[0],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpslldq	($t3,$t3,8);		# 22 instructions
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);

    for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lxop_00_47");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }

    my @X = map("%xmm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vpshufb	$t3,@X[0],@X[0]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[1],@X[1]
	vmovdqu	0x40($inp),@X[4]
	vpshufb	$t3,@X[2],@X[2]
	vmovdqu	0x50($inp),@X[5]
	vpshufb	$t3,@X[3],@X[3]
	vmovdqu	0x60($inp),@X[6]
	vpshufb	$t3,@X[4],@X[4]
	vmovdqu	0x70($inp),@X[7]
	vpshufb	$t3,@X[5],@X[5]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t3,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t3,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x10(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x20(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x30(%rsp)
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x40(%rsp)
	vmovdqa	$t1,0x50(%rsp)
	vmovdqa	$t2,0x60(%rsp)
	vmovdqa	$t3,0x70(%rsp)
	add	\$`16*2*$SZ`,$Tbl

sub XOP_512_00_47 () {
    my @insns = (&$body,&$body);		# 52 instructions
	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..2]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpalignr	($t3,@X[5],@X[4],$SZ);	# X[9..10]
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotq		($t1,$t0,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrlq		($t0,$t0,$sigma0[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddq		(@X[0],@X[0],$t3);	# X[0..1] += X[9..10]
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotq		($t2,$t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t1);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotq		($t3,@X[7],8*$SZ-$sigma1[1]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..2])
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpsrlq		($t2,@X[7],$sigma1[2]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddq		(@X[0],@X[0],$t0);	# X[0..1] += sigma0(X[1..2])
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vprotq		($t1,$t3,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t2);
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddq		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	  eval(shift(@insns));
	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);

    for ($i=0,$j=0; $j<8; $j++) {
	&XOP_512_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)
	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
	&jne	(".Lxop_00_47");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
	lea	16*$SZ($inp),$inp
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
.size	${func}_xop,.-${func}_xop

######################################################################
# AVX+shrd code path
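#
# (shrd with both register operands naming the same register performs a
# rotate; on Sandy Bridge it apparently schedules better than ror, which
# is why ror is aliased to shrd below - see footnote (**) in the
# performance table above.)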
local *ror = sub { &shrd(@_[0],@_) };

.type	${func}_avx,\@function,3
	mov	%rsp,%r11		# copy %rsp
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%r11,$_rsp		# save copy of %rsp
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)

if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));

	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$`-16*2*$SZ`,$Tbl	# size optimization

sub Xupdate_256_AVX () {
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',
	'&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',
	'&vpshufb	($t2,$t2,$t5)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])

sub AVX_256_00_47 () {
    my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }

    my @X = map("%xmm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vpshufb	$t3,@X[0],@X[0]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[1],@X[1]
	vmovdqu	0x40($inp),@X[4]
	vpshufb	$t3,@X[2],@X[2]
	vmovdqu	0x50($inp),@X[5]
	vpshufb	$t3,@X[3],@X[3]
	vmovdqu	0x60($inp),@X[6]
	vpshufb	$t3,@X[4],@X[4]
	vmovdqu	0x70($inp),@X[7]
	vpshufb	$t3,@X[5],@X[5]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t3,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t3,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x10(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x20(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x30(%rsp)
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x40(%rsp)
	vmovdqa	$t1,0x50(%rsp)
	vmovdqa	$t2,0x60(%rsp)
	vmovdqa	$t3,0x70(%rsp)
	add	\$`16*2*$SZ`,$Tbl

sub Xupdate_512_AVX () {
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
	'&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
	'&vpsrlq	($t2,$t0,$sigma0[0])',
	'&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
	'&vpsrlq	($t3,$t0,$sigma0[2])',
	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrlq	($t3,@X[7],$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..2])
	'&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
	'&vpsrlq	($t1,@X[7],$sigma1[0]);',
	'&vpxor		($t3,$t3,$t2)',
	'&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
	'&vpxor		($t3,$t3,$t1)',
	'&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
	'&vpxor		($t3,$t3,$t2)',
	'&vpxor		($t3,$t3,$t1)',		# sigma1(X[14..15])
	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])

sub AVX_512_00_47 () {
    my @insns = (&$body,&$body);		# 52 instructions
	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval(shift(@insns));
	    eval(shift(@insns));
	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);

    for ($i=0,$j=0; $j<8; $j++) {
	&AVX_512_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)
	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
	&jne	(".Lavx_00_47");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
	lea	16*$SZ($inp),$inp
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
.size	${func}_avx,.-${func}_avx

######################################################################
# AVX2+BMI code path
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',		# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',
	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',		# ~e&g
	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',		# Sigma1(e)
	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
	'&xor	($a2,$b)',		# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',
	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',	# d+=h
	'&and	($a3,$a2)',		# (b^c)&(a^b)
	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',		# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov	($a4,$e)',		# copy of f in future
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	# and at the finish one has to $a+=$a1
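	# (rorx, a BMI2 rotate, writes its result to a separate destination
	# and leaves the flags alone, and andn computes ~e&g directly, which
	# is how Ch(e,f,g)=(e&f)+(~e&g) is formed above without an explicit
	# not; that is what makes this bodyx_00_15 variant shorter.)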
.type	${func}_avx2,\@function,3
	mov	%rsp,%r11		# copy %rsp
	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
	shl	\$4,%rdx		# num*16
	and	\$-256*$SZ,%rsp		# align stack frame
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	add	\$`2*$SZ*($rounds-8)`,%rsp
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%r11,$_rsp		# save copy of %rsp
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)
	sub	\$-16*$SZ,$inp		# inp++, size optimization
	mov	$inp,%r12		# borrow $T1
	cmp	%rdx,$inp		# $_end
	cmove	%rsp,%r12		# next block or random data

if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));

	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	vmovdqu	-16*$SZ+48($inp),%xmm3
	#mov	$inp,$_inp	# offload $inp
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vmovdqa	$t2,0x00(%rsp)
	vmovdqa	$t3,0x20(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization

sub AVX2_256_00_47 () {
    my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
    my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }

    my @X = map("%ymm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));

	vmovdqu	-16*$SZ($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	-16*$SZ+48($inp),%xmm3
	vmovdqu	-16*$SZ+64($inp),%xmm4
	vmovdqu	-16*$SZ+80($inp),%xmm5
	vmovdqu	-16*$SZ+96($inp),%xmm6
	vmovdqu	-16*$SZ+112($inp),%xmm7
	#mov	$inp,$_inp	# offload $inp
	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t2,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t2,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	vpshufb	$t2,@X[2],@X[2]
	vinserti128	\$1,64(%r12),@X[4],@X[4]
	vpshufb	$t2,@X[3],@X[3]
	vinserti128	\$1,80(%r12),@X[5],@X[5]
	vpshufb	$t2,@X[4],@X[4]
	vinserti128	\$1,96(%r12),@X[6],@X[6]
	vpshufb	$t2,@X[5],@X[5]
	vinserti128	\$1,112(%r12),@X[7],@X[7]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t2,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t2,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x20(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x40(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x60(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x20(%rsp)
	vmovdqa	$t2,0x40(%rsp)
	vmovdqa	$t3,0x60(%rsp)

sub AVX2_512_00_47 () {
    my @insns = (&$body,&$body);		# 48 instructions
    my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);

    for ($i=0,$j=0; $j<8; $j++) {
	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));		# rotate(@X)
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end

    for ($i=0; $i<8; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
	lea	-$PUSH8($Tbl),$Tbl
	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
	lea	`2*16*$SZ`($inp),$inp	# inp+=2
	cmove	%rsp,%r12		# next block or stale data
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
.size	${func}_avx2,.-${func}_avx2

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
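#
# In short: se_handler checks whether context->Rip lies between the
# prologue and epilogue labels recorded in HandlerData[]; if it does, it
# recovers the caller's %rsp saved in the frame (at offset 16*SZ+24) and
# restores the non-volatile registers (plus xmm6-xmm9, or xmm6-xmm11 for
# the SHA-512 SIMD paths), then lets the unwind search continue.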
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	add	\$`2*$SZ*($rounds-8)`,%rax
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	.Lepilogue(%rip),%r10
	jb	.Lin_prologue		# non-AVX code

	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$`$SZ==4?8:12`,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
.size	se_handler,.-se_handler

	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func
$code.=<<___ if ($SZ==4);
	.rva	.LSEH_begin_${func}_ssse3
	.rva	.LSEH_end_${func}_ssse3
	.rva	.LSEH_info_${func}_ssse3
$code.=<<___ if ($avx && $SZ==8);
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
	.rva	.Lprologue,.Lepilogue			# HandlerData[]
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
$code.=<<___ if ($avx && $SZ==8);
.LSEH_info_${func}_xop:
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]

$code =~ s/\`([^\`]*)\`/eval $1/gem;