# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================
#
# sha256/512_block procedure for x86_64.
#
# 40% improvement over compiler-generated code on Opteron. On EM64T
# sha256 was observed to run >80% faster and sha512 >40% faster. No
# magical tricks, just straight implementation... I really wonder why
# gcc [being armed with inline assembler] fails to generate code as
# fast. The only thing which is cool about this module is that the
# very same instruction sequence is used for both SHA-256 and SHA-512.
# In the former case the instructions operate on 32-bit operands,
# while in the latter on 64-bit ones. All I had to do was get one
# flavor right; the other passed the test right away:-)
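#
# A minimal illustration of that claim (reference helpers only, never
# called by the generator; the ref_ prefix marks them as ad hoc): with
# the word size and rotation counts as the only parameters, one formula
# serves both hashes. Shown for the 32-bit SHA-256 flavor; for SHA-512
# one widens to 64 bits and swaps in the counts (28,34,39) and
# (14,18,41).
sub ref_ror32	   { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff; }
sub ref_Sigma0_256 { my $x=shift; ref_ror32($x,2)^ref_ror32($x,13)^ref_ror32($x,22); }
sub ref_Sigma1_256 { my $x=shift; ref_ror32($x,6)^ref_ror32($x,11)^ref_ror32($x,25); }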
#
# sha256_block runs in ~1005 cycles on Opteron, which gives you
# asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
# frequency in GHz. sha512_block runs in ~1275 cycles, which results
# in 128*1000/1275=100MBps per GHz. Is there room for improvement?
# Well, if you compare it to the IA-64 implementation, which maintains
# X[16] in the register bank[!], approaches 4 instructions per CPU
# clock cycle and runs in 1003 cycles, then 1275 is a very good result
# for the 3-way issue Opteron pipeline with X[16] maintained in memory.
# So that *if* there is a way to improve it, *then* the only way would
# be to try to offload the X[16] updates to the SSE unit, but that
# would require "deeper" loop unroll, which in turn would naturally
# cause size blow-up, not to mention increased complexity! And once
# again, only *if* it's actually possible to noticeably improve overall
# instruction-level parallelism, ILP, on the given CPU implementation.
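#
# For the record, the MBps-per-GHz figures are simply bytes-per-block
# over cycles-per-block, e.g. (a throwaway check, not used below):
#
#	sub ref_mbps_per_ghz { my ($bytes,$cycles)=@_; $bytes*1000/$cycles; }
#	# ref_mbps_per_ghz(64,1005)  is ~63.7	(sha256_block on Opteron)
#	# ref_mbps_per_ghz(128,1275) is ~100	(sha512_block on Opteron)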
#
# Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
# performance ratio of 1.5 between the 64- and 32-bit flavors [see
# above], [currently available] EM64T CPUs are apparently far from it.
# On the contrary, the 64-bit version, sha512_block, is ~30% *slower*
# than the 32-bit sha256_block:-( This is presumably because 64-bit
# shifts/rotates are not single hardware operations there, but are
# implemented in microcode.
#
# Optimization including one of Pavel Semjanov's ideas, alternative
# Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
# unfortunately -10% SHA512 on P4 [which nobody should care about].
#
# Add SIMD code paths, see below for improvement coefficients. An SSSE3
# code path was not attempted for SHA512, because the estimated
# improvement, noticeably less than 9%, is not high enough to justify
# the effort, at least not on pre-AVX processors. [The obvious exception
# is VIA Nano, but it has a SHA512 instruction that is faster and
# should be used instead.] For reference, the corresponding estimated
# upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
# higher coefficients are observed on VIA Nano and Bulldozer has more
# to do with specifics of their architecture [which is a topic for
# separate discussion].
#
# Add AVX2 code path. Two consecutive input blocks are loaded into
# 256-bit %ymm registers, with data from the first block in the least
# significant 128-bit halves and data from the second in the most
# significant. The data is then processed with the same SIMD
# instruction sequence as for AVX, but with %ymm as operands. The side
# effect is an increased stack frame, 448 additional bytes in SHA256
# and 1152 in SHA512.
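#
# Schematically, an informal sketch of the lane assignment (hi:lo):
#
#	%ymmN = [ 2nd block, words 4i..4i+3 | 1st block, words 4i..4i+3 ]
#
# so each 128-bit lane runs the AVX algorithm on its own block.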
######################################################################
# Current performance in cycles per processed byte (less is better):
#
#		SHA256	SSSE3       AVX/XOP(*)	    SHA512  AVX/XOP(*)
#
# AMD K8	15.1	-	    -		    9.70    -
# Core 2	15.5	13.9(+11%)  -		    10.3    -
# Westmere	15.1	12.5(+21%)  -		    9.72    -
# Sandy Bridge	17.4	14.0(+24%)  11.6(+50%(**))  11.2    8.10(+38%(**))
# Ivy Bridge	12.6	10.3(+22%)  10.3(+22%)	    8.17    7.22(+13%)
# Bulldozer	21.5	13.7(+57%)  13.7(+57%(***)) 13.5    8.58(+57%)
# VIA Nano	23.0	16.3(+41%)  -		    14.7    -
# Atom		23.0	21.6(+6%)   -		    14.7    -
#
# (*)	whichever is best applicable;
# (**)	the switch from ror to shrd accounts for a fair share of the
#	improvement;
# (***)	execution time is fully determined by the remaining integer-only
#	part, body_00_15; reducing the amount of SIMD instructions below
#	a certain limit makes no difference/sense; to conserve space the
#	SHA256 XOP code path is therefore omitted;

if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
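# Whichever probe fires, $avx lands on the same scale: 0 keeps the
# base/SSSE3 code only, 1 unlocks the AVX/XOP paths, 2 additionally
# AVX2. The comparisons rely on Perl reading the captured version
# strings as numbers, e.g. GNU as 2.23 gives (2.23>=2.19)+(2.23>=2.22),
# i.e. $avx==2.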

open OUT,"| \"$^X\" $xlate $flavour $output";
if ($output =~ /512/) {
	$func="sha512_block_data_order";

	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
					"%r8", "%r9", "%r10","%r11");
	($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");

	$func="sha256_block_data_order";

	@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
					"%r8d","%r9d","%r10d","%r11d");
	($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");

$ctx="%rdi";	# 1st arg, zapped by $a3
$inp="%rsi";	# 2nd arg

$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_rsp="16*$SZ+3*8(%rsp)";
$framesz="16*$SZ+4*8";
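# The resulting frame layout (an informal sketch; the offsets follow
# from the definitions just above, low addresses first):
#
#	(%rsp)		16 slots of $SZ bytes	circular X[] schedule
#	16*$SZ+0*8	saved $ctx
#	16*$SZ+1*8	saved $inp
#	16*$SZ+2*8	input end pointer
#	16*$SZ+3*8	caller's %rsp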
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

  $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));

	ror	\$`$Sigma1[2]-$Sigma1[1]`,$a0
	ror	\$`$Sigma0[2]-$Sigma0[1]`,$a1
	mov	$T1,`$SZ*($i&0xf)`(%rsp)
	ror	\$`$Sigma1[1]-$Sigma1[0]`,$a0
	xor	$g,$a2			# Ch(e,f,g)=((f^g)&e)^g
	ror	\$`$Sigma0[1]-$Sigma0[0]`,$a1
	add	$a2,$T1			# T1+=Ch(e,f,g)
	add	($Tbl),$T1		# T1+=K[round]
	ror	\$$Sigma1[0],$a0	# Sigma1(e)
	xor	$b,$a2			# a^b, b^c in next round
	ror	\$$Sigma0[0],$a1	# Sigma0(a)
	add	$a0,$T1			# T1+=Sigma1(e)
	xor	$a3,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
$code.=<<___ if ($i>=15);
	mov	`$SZ*(($i+2)&0xf)`(%rsp),$a0
	lea	$STRIDE($Tbl),$Tbl	# round++
	add	$a1,$h			# h+=Sigma0(a)

  ($a2,$a3) = ($a3,$a2);
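# For reference, this is what one such round computes, as straight-line
# Perl (a sketch in terms of the ref_ helpers near the top of this
# file; never executed, 32-bit SHA-256 flavor):
#
#	sub ref_round_256 {
#	    my ($K,$Xi,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
#	    my $Ch  = (($f^$g)&$e)^$g;		# as in the code above
#	    my $Maj = (($a^$b)&($b^$c))^$b;	# the "Ch(a^b,c,b)" form
#	    my $T1  = ($h+ref_Sigma1_256($e)+$Ch+$K+$Xi) & 0xffffffff;
#	    (($T1+ref_Sigma0_256($a)+$Maj) & 0xffffffff,	# new a
#	     $a,$b,$c, ($d+$T1) & 0xffffffff, $e,$f,$g);	# new b..h
#	}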
{ my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

	#mov	`$SZ*(($i+1)&0xf)`(%rsp),$a0
	mov	`$SZ*(($i+14)&0xf)`(%rsp),$a1
	ror	\$`$sigma0[1]-$sigma0[0]`,$a0
	ror	\$`$sigma1[1]-$sigma1[0]`,$a1
	xor	$a0,$T1			# sigma0(X[(i+1)&0xf])
	add	`$SZ*(($i+9)&0xf)`(%rsp),$T1
	xor	$a2,$a1			# sigma1(X[(i+14)&0xf])
	add	`$SZ*($i&0xf)`(%rsp),$T1
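# i.e. the textbook schedule update, with the 16 stack slots used as a
# ring buffer (a sketch, not executed; sigma0/sigma1 are the SHA-256
# small sigmas, ror7^ror18^shr3 and ror17^ror19^shr10):
#
#	$X[$i&15] = ($X[$i&15] + sigma0($X[($i+1)&15])
#				+ $X[($i+9)&15]
#				+ sigma1($X[($i+14)&15])) & 0xffffffff;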
.extern	OPENSSL_ia32cap_P
.type	$func,\@function,3
$code.=<<___ if ($SZ==4 || $avx);
	lea	OPENSSL_ia32cap_P(%rip),%r11
$code.=<<___ if ($avx && $SZ==8);
	test	\$`1<<11`,%r10d			# check for XOP
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
$code.=<<___ if ($avx);
	and	\$`1<<30`,%r9d			# mask "Intel CPU" bit
	and	\$`1<<28|1<<9`,%r10d		# mask AVX and SSSE3 bits
	cmp	\$`1<<28|1<<9|1<<30`,%r10d
$code.=<<___ if ($SZ==4);
	mov	%rsp,%r11		# copy %rsp
	shl	\$4,%rdx		# num*16
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%r11,$_rsp		# save copy of %rsp
	lea	$TABLE(%rip),$Tbl

for($i=0;$i<16;$i++) {
	$code.="	mov	$SZ*$i($inp),$T1\n";
	$code.="	mov	@ROT[4],$a0\n";
	$code.="	mov	@ROT[0],$a1\n";
	$code.="	bswap	$T1\n";
	&ROUND_00_15($i,@ROT);
	unshift(@ROT,pop(@ROT));

	&ROUND_16_XX($i,@ROT);
	unshift(@ROT,pop(@ROT));
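# (rotating @ROT by one here maps this round's (a..h) onto the next
# round's names, emulating the a..h register shift without any moves:
#	(A,B,C,D,E,F,G,H) -> (H,A,B,C,D,E,F,G), etc.)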
	cmpb	\$0,`$SZ-1`($Tbl)
	lea	16*$SZ($inp),$inp
.type	$TABLE,\@object
.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
.long	0x03020100,0x0b0a0908,0xffffffff,0xffffffff
.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
.long	0xffffffff,0xffffffff,0x03020100,0x0b0a0908
.asciz	"SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.type	$TABLE,\@object
.quad	0x428a2f98d728ae22,0x7137449123ef65cd
.quad	0x428a2f98d728ae22,0x7137449123ef65cd
.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
.quad	0x3956c25bf348b538,0x59f111f1b605d019
.quad	0x3956c25bf348b538,0x59f111f1b605d019
.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
.quad	0xd807aa98a3030242,0x12835b0145706fbe
.quad	0xd807aa98a3030242,0x12835b0145706fbe
.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
.quad	0x9bdc06a725c71235,0xc19bf174cf692694
.quad	0x9bdc06a725c71235,0xc19bf174cf692694
.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
.quad	0x983e5152ee66dfab,0xa831c66d2db43210
.quad	0x983e5152ee66dfab,0xa831c66d2db43210
.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
.quad	0x06ca6351e003826f,0x142929670a0e6e70
.quad	0x06ca6351e003826f,0x142929670a0e6e70
.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
.quad	0x81c2c92e47edaee6,0x92722c851482353b
.quad	0x81c2c92e47edaee6,0x92722c851482353b
.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
.quad	0xd192e819d6ef5218,0xd69906245565a910
.quad	0xd192e819d6ef5218,0xd69906245565a910
.quad	0xf40e35855771202a,0x106aa07032bbd1b8
.quad	0xf40e35855771202a,0x106aa07032bbd1b8
.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
.quad	0x90befffa23631e28,0xa4506cebde82bde9
.quad	0x90befffa23631e28,0xa4506cebde82bde9
.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
.quad	0xca273eceea26619c,0xd186b8c721c0c207
.quad	0xca273eceea26619c,0xd186b8c721c0c207
.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
.quad	0x113f9804bef90dae,0x1b710b35131c471b
.quad	0x113f9804bef90dae,0x1b710b35131c471b
.quad	0x28db77f523047d84,0x32caab7b40c72493
.quad	0x28db77f523047d84,0x32caab7b40c72493
.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817

.quad	0x0001020304050607,0x08090a0b0c0d0e0f
.quad	0x0001020304050607,0x08090a0b0c0d0e0f
.asciz	"SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
######################################################################

my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
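# (An undefined call such as &ror($a0,14) lands in this AUTOLOAD thunk,
# which appends "\tror\t\$14,$a0\n" to $code; the instruction lists
# below thus read like assembler while remaining plain Perl.)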
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g
	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&and	($a4,$e)',			# (f^g)&e
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&add	($d,$h)',			# d+=h
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($h,$a3)',			# h+=Maj(a,b,c)
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
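# body_00_15 returns the round as a list of instruction strings rather
# than emitting code directly; the SIMD paths below eval them a few at
# a time, so integer and vector instructions can be interleaved at
# whatever ratio suits the pipeline. Illustratively:
#
#	my @insns = (&$body,&$body);		# two rounds' worth
#	eval(shift(@insns)) for (1..3);		# three integer ops...
#	&vpalignr (...);			# ...then one vector op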
######################################################################

if ($SZ==4) {	# SHA256 only
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));

.type	${func}_ssse3,\@function,3
	mov	%rsp,%r11		# copy %rsp
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*4`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%r11,$_rsp		# save copy of %rsp
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
	movdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	movdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	movdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	movdqu	0x00($inp),@X[0]
	movdqu	0x10($inp),@X[1]
	movdqu	0x20($inp),@X[2]
	movdqu	0x30($inp),@X[3]
	lea	$TABLE(%rip),$Tbl
	movdqa	0x00($Tbl),$t0
	movdqa	0x20($Tbl),$t1
	movdqa	0x40($Tbl),$t2
	movdqa	0x60($Tbl),$t3
	movdqa	$t0,0x00(%rsp)
	movdqa	$t1,0x10(%rsp)
	movdqa	$t2,0x20(%rsp)
	movdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
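	# ("add \$128" would need a 4-byte immediate, whereas the
	# equivalent "sub \$-128" fits in a sign-extended byte, hence
	# the size optimization; the same trick recurs below)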
sub Xupdate_256_SSSE3 () {
	'&movdqa	($t0,@X[1]);',
	'&movdqa	($t3,@X[3])',
	'&palignr	($t0,@X[0],$SZ)',	# X[1..4]
	'&palignr	($t3,@X[2],$SZ);',	# X[9..12]
	'&movdqa	($t2,$t0);',
	'&psrld	($t0,$sigma0[2])',
	'&paddd	(@X[0],$t3);',		# X[0..3] += X[9..12]
	'&psrld	($t2,$sigma0[0])',
	'&pshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&pslld	($t1,8*$SZ-$sigma0[1]);'.
	'&psrld	($t2,$sigma0[1]-$sigma0[0]);'.
	'&pslld	($t1,$sigma0[1]-$sigma0[0]);'.
	'&pxor	($t0,$t1);',		# sigma0(X[1..4])
	'&psrld	($t3,$sigma1[2])',
	'&paddd	(@X[0],$t0);',		# X[0..3] += sigma0(X[1..4])
	'&psrlq	($t2,$sigma1[0])',
	'&psrlq	($t2,$sigma1[1]-$sigma1[0])',
	'&pshufb	($t3,$t4)',		# sigma1(X[14..15])
	'&paddd	(@X[0],$t3)',		# X[0..1] += sigma1(X[14..15])
	'&pshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&movdqa	($t2,$t3);',
	'&psrld	($t3,$sigma1[2])',
	'&psrlq	($t2,$sigma1[0])',
	'&psrlq	($t2,$sigma1[1]-$sigma1[0])',
	'&movdqa	($t2,16*2*$j."($Tbl)")',
	'&paddd	(@X[0],$t3)'		# X[2..3] += sigma1(X[16..17])
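# (SSE carries no packed rotate, so each 32-bit ror above is synthesized
# as ror(x,n) = psrld(x,n) ^ pslld(x,32-n), which is what the
# shift/shift/pxor chains amount to; for sigma1 the pshufd first doubles
# each word into a 64-bit lane so that psrlq can stand in for a rotate.)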
sub SSSE3_256_00_47 () {

my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	foreach (Xupdate_256_SSSE3()) {		# 36 instructions

	} else {			# squeeze extra 3% on Westmere and Atom
	  eval(shift(@insns));	#@
	  eval(shift(@insns));	#@
	&palignr	($t0,@X[0],$SZ);	# X[1..4]
	  eval(shift(@insns));	#@
	&palignr	($t3,@X[2],$SZ);	# X[9..12]
	  eval(shift(@insns));	#@
	  eval(shift(@insns));	#@
	&psrld	($t0,$sigma0[2]);
	&paddd	(@X[0],$t3);		# X[0..3] += X[9..12]
	  eval(shift(@insns));	#@
	&psrld	($t2,$sigma0[0]);
	  eval(shift(@insns));	#@
	&pshufd	($t3,@X[3],0b11111010);	# X[14..15]
	&pslld	($t1,8*$SZ-$sigma0[1]);
	  eval(shift(@insns));	#@
	&psrld	($t2,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));	#@
	&pslld	($t1,$sigma0[1]-$sigma0[0]);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));	#@
	&pxor	($t0,$t1);		# sigma0(X[1..4])
	&psrld	($t3,$sigma1[2]);
	&paddd	(@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
	  eval(shift(@insns));	#@
	&psrlq	($t2,$sigma1[0]);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));	#@
	&psrlq	($t2,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));	#@
	&pshufb	($t3,$t4);		# sigma1(X[14..15])
	  eval(shift(@insns));	#@
	  eval(shift(@insns));	#@
	&paddd	(@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
	&pshufd	($t3,@X[0],0b01010000);	# X[16..17]
	  eval(shift(@insns));	#@
	&psrld	($t3,$sigma1[2]);
	&psrlq	($t2,$sigma1[0]);
	  eval(shift(@insns));	#@
	  eval(shift(@insns));	#@
	&psrlq	($t2,$sigma1[1]-$sigma1[0]);
	  eval(shift(@insns));	#@
	&movdqa	($t2,16*2*$j."($Tbl)");
	  eval(shift(@insns));	#@
	  eval(shift(@insns));	#@
	&paddd	(@X[0],$t3);		# X[2..3] += sigma1(X[16..17])

	foreach (@insns) { eval; }		# remaining instructions
	&movdqa	(16*$j."(%rsp)",$t2);

    for ($i=0,$j=0; $j<4; $j++) {
	&SSSE3_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lssse3_00_47");
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }

	lea	16*$SZ($inp),$inp

$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
.size	${func}_ssse3,.-${func}_ssse3

######################################################################

if ($SZ==8) {	# SHA512 only
.type	${func}_xop,\@function,3
	mov	%rsp,%r11		# copy %rsp
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%r11,$_rsp		# save copy of %rsp
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)
if ($SZ==4) {	# SHA256
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
sub XOP_256_00_47 () {

my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	&vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vprotd	($t1,$t0,8*$SZ-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrld	($t0,$t0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[0],@X[0],$t3);		# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vprotd	($t2,$t1,$sigma0[1]-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	($t0,$t0,$t1);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vprotd	($t3,@X[3],8*$SZ-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	($t0,$t0,$t2);			# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrld	($t2,@X[3],$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[0],@X[0],$t0);		# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	($t3,$t3,$t2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	($t3,$t3,$t1);			# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq	($t3,$t3,8);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[0],@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vprotd	($t3,@X[0],8*$SZ-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrld	($t2,@X[0],$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vprotd	($t1,$t3,$sigma1[1]-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	($t3,$t3,$t2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	($t3,$t3,$t1);			# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpslldq	($t3,$t3,8);		# 22 instructions
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[0],@X[0],$t3);		# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);

    for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lxop_00_47");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
my @X = map("%xmm$_",(0..7));
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vpshufb	$t3,@X[0],@X[0]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[1],@X[1]
	vmovdqu	0x40($inp),@X[4]
	vpshufb	$t3,@X[2],@X[2]
	vmovdqu	0x50($inp),@X[5]
	vpshufb	$t3,@X[3],@X[3]
	vmovdqu	0x60($inp),@X[6]
	vpshufb	$t3,@X[4],@X[4]
	vmovdqu	0x70($inp),@X[7]
	vpshufb	$t3,@X[5],@X[5]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t3,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t3,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x10(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x20(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x30(%rsp)
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x40(%rsp)
	vmovdqa	$t1,0x50(%rsp)
	vmovdqa	$t2,0x60(%rsp)
	vmovdqa	$t3,0x70(%rsp)
sub XOP_512_00_47 () {

my @insns = (&$body,&$body);			# 52 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..2]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr	($t3,@X[5],@X[4],$SZ);	# X[9..10]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vprotq	($t1,$t0,8*$SZ-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrlq	($t0,$t0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddq	(@X[0],@X[0],$t3);		# X[0..1] += X[9..10]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vprotq	($t2,$t1,$sigma0[1]-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	($t0,$t0,$t1);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vprotq	($t3,@X[7],8*$SZ-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	($t0,$t0,$t2);			# sigma0(X[1..2])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrlq	($t2,@X[7],$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddq	(@X[0],@X[0],$t0);		# X[0..1] += sigma0(X[1..2])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vprotq	($t1,$t3,$sigma1[1]-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	($t3,$t3,$t2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	($t3,$t3,$t1);			# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddq	(@X[0],@X[0],$t3);		# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddq	($t2,@X[0],16*2*$j-0x80."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);

    for ($i=0,$j=0; $j<8; $j++) {
	&XOP_512_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
	&jne	(".Lxop_00_47");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
	lea	16*$SZ($inp),$inp

$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
.size	${func}_xop,.-${func}_xop

######################################################################
# AVX+shrd code path
local *ror = sub { &shrd(@_[0],@_) };

.type	${func}_avx,\@function,3
	mov	%rsp,%r11		# copy %rsp
	shl	\$4,%rdx		# num*16
	sub	\$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp		# align stack frame
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%r11,$_rsp		# save copy of %rsp
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)
if ($SZ==4) {	# SHA256
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));

	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
sub Xupdate_256_AVX () {
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor	($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor	($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor	($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor	($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor	($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor	($t2,$t2,$t3)',
	'&vpshufb	($t2,$t2,$t4)',		# sigma1(X[14..15])
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor	($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor	($t2,$t2,$t3)',
	'&vpshufb	($t2,$t2,$t5)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])

sub AVX_256_00_47 () {

my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
my @X = map("%xmm$_",(0..7));
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00($inp),@X[0]
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	0x10($inp),@X[1]
	vmovdqu	0x20($inp),@X[2]
	vpshufb	$t3,@X[0],@X[0]
	vmovdqu	0x30($inp),@X[3]
	vpshufb	$t3,@X[1],@X[1]
	vmovdqu	0x40($inp),@X[4]
	vpshufb	$t3,@X[2],@X[2]
	vmovdqu	0x50($inp),@X[5]
	vpshufb	$t3,@X[3],@X[3]
	vmovdqu	0x60($inp),@X[6]
	vpshufb	$t3,@X[4],@X[4]
	vmovdqu	0x70($inp),@X[7]
	vpshufb	$t3,@X[5],@X[5]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t3,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t3,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x10(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x20(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x30(%rsp)
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x40(%rsp)
	vmovdqa	$t1,0x50(%rsp)
	vmovdqa	$t2,0x60(%rsp)
	vmovdqa	$t3,0x70(%rsp)
sub Xupdate_512_AVX () {
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..2]
	'&vpalignr	($t3,@X[5],@X[4],$SZ)',	# X[9..10]
	'&vpsrlq	($t2,$t0,$sigma0[0])',
	'&vpaddq	(@X[0],@X[0],$t3);',	# X[0..1] += X[9..10]
	'&vpsrlq	($t3,$t0,$sigma0[2])',
	'&vpsllq	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor	($t0,$t3,$t2)',
	'&vpsrlq	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor	($t0,$t0,$t1)',
	'&vpsllq	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor	($t0,$t0,$t2)',
	'&vpsrlq	($t3,@X[7],$sigma1[2]);',
	'&vpxor	($t0,$t0,$t1)',		# sigma0(X[1..2])
	'&vpsllq	($t2,@X[7],8*$SZ-$sigma1[1]);',
	'&vpaddq	(@X[0],@X[0],$t0)',	# X[0..1] += sigma0(X[1..2])
	'&vpsrlq	($t1,@X[7],$sigma1[0]);',
	'&vpxor	($t3,$t3,$t2)',
	'&vpsllq	($t2,$t2,$sigma1[1]-$sigma1[0]);',
	'&vpxor	($t3,$t3,$t1)',
	'&vpsrlq	($t1,$t1,$sigma1[1]-$sigma1[0]);',
	'&vpxor	($t3,$t3,$t2)',
	'&vpxor	($t3,$t3,$t1)',		# sigma1(X[14..15])
	'&vpaddq	(@X[0],@X[0],$t3)',	# X[0..1] += sigma1(X[14..15])

sub AVX_512_00_47 () {

my @insns = (&$body,&$body);			# 52 instructions

	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval(shift(@insns));
	    eval(shift(@insns));
	&vpaddq	($t2,@X[0],16*2*$j-0x80."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);

    for ($i=0,$j=0; $j<8; $j++) {
	&AVX_512_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&cmpb	($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
	&jne	(".Lavx_00_47");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }

	lea	16*$SZ($inp),$inp

$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
.size	${func}_avx,.-${func}_avx
######################################################################
# AVX2+BMI code path
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp

sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',		# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',

	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',		# ~e&g

	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',		# Sigma1(e)

	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
	'&xor	($a2,$b)',		# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',

	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',	# d+=h
	'&and	($a3,$a2)',		# (b^c)&(a^b)

	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',		# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov	($a4,$e)',		# copy of f in future

	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'

	# and at the finish one has to $a+=$a1
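# Two details worth noting above: (e&f) and (~e&g) select disjoint bit
# positions, so Ch(e,f,g) may be accumulated with lea/add instead of
# xor; and h+=Sigma0(a) is deferred into the following round ("from the
# past"), which is why the very last round must finish with $a+=$a1.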
.type	${func}_avx2,\@function,3
	mov	%rsp,%r11		# copy %rsp
	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
	shl	\$4,%rdx		# num*16
	and	\$-256*$SZ,%rsp		# align stack frame
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	add	\$`2*$SZ*($rounds-8)`,%rsp
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%r11,$_rsp		# save copy of %rsp
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)
	sub	\$-16*$SZ,$inp		# inp++, size optimization
	xor	%r12,%r12		# borrow $T1
	cmp	%rdx,$inp		# $_end

if ($SZ==4) {	# SHA256
my @X = map("%ymm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));

	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	shl	\$`log(16*$SZ)/log(2)`,%r12
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0($inp),$t0
	vmovdqu	-16*$SZ+32($inp),$t1
	vmovdqu	(%r12),@X[2]		# next or same input block
	vmovdqu	32(%r12),@X[3]
	vperm2i128	\$0x20,@X[2],$t0,@X[0]
	#mov	$inp,$_inp		# offload $inp
	vperm2i128	\$0x31,@X[2],$t0,@X[1]
	vperm2i128	\$0x20,@X[3],$t1,@X[2]
	vperm2i128	\$0x31,@X[3],$t1,@X[3]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[0],@X[0]
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vmovdqa	$t2,0x00(%rsp)
	vmovdqa	$t3,0x20(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
sub AVX2_256_00_47 () {

my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
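# Note the frame discipline of the AVX2 paths: instead of indexing one
# fixed X[] area, AVX2_256_00_47 periodically drops %rsp by $PUSH8, so
# the vmovdqa stores always land at the same small offsets while
# bodyx_00_15 reads X[i]+K[i] relative to the moving $base; the
# epilogue rewinds %rsp by 2*$SZ*($rounds-8) in one go.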
my @X = map("%ymm$_",(0..7));
my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));

	shl	\$`log(16*$SZ)/log(2)`,%r12
	vmovdqu	-16*$SZ($inp),$t0
	vmovdqu	-16*$SZ+32($inp),$t1
	vmovdqu	-16*$SZ+64($inp),$t2
	vmovdqu	-16*$SZ+96($inp),$t3
	vmovdqu	(%r12),@X[4]		# next or same block
	vmovdqu	32(%r12),@X[5]
	vmovdqu	64(%r12),@X[6]
	vmovdqu	96(%r12),@X[7]
	vperm2i128	\$0x20,@X[4],$t0,@X[0]
	#mov	$inp,$_inp		# offload $inp
	vperm2i128	\$0x31,@X[4],$t0,@X[1]
	vperm2i128	\$0x20,@X[5],$t1,@X[2]
	vperm2i128	\$0x31,@X[5],$t1,@X[3]
	vperm2i128	\$0x20,@X[6],$t2,@X[4]
	vperm2i128	\$0x31,@X[6],$t2,@X[5]
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t2
	vperm2i128	\$0x20,@X[7],$t3,@X[6]
	vperm2i128	\$0x31,@X[7],$t3,@X[7]
	vpshufb	$t2,@X[0],@X[0]
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vpshufb	$t2,@X[1],@X[1]
	vpshufb	$t2,@X[2],@X[2]
	vpshufb	$t2,@X[3],@X[3]
	vpshufb	$t2,@X[4],@X[4]
	vpshufb	$t2,@X[5],@X[5]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t2,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t2,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x20(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x40(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x60(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x20(%rsp)
	vmovdqa	$t2,0x40(%rsp)
	vmovdqa	$t3,0x60(%rsp)
sub AVX2_512_00_47 () {

my @insns = (&$body,&$body);			# 48 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	&vpaddq	($t2,@X[0],16*2*$j-0x80."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);

    for ($i=0,$j=0; $j<8; $j++) {
	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }

	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	cmp	`$PUSH8+2*8`($Tbl),$inp		# $_end

    for ($i=0; $i<8; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }

	lea	-$PUSH8($Tbl),$Tbl

	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp

	lea	`2*16*$SZ`($inp),$inp	# inp+=2
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
.size	${func}_avx2,.-${func}_avx2
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)

.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label

$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	add	\$`2*$SZ*($rounds-8)`,%rax

	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	.Lepilogue(%rip),%r10
	jb	.Lin_prologue		# non-AVX code

	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$`$SZ==4?8:12`,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
.size	se_handler,.-se_handler
	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func
$code.=<<___ if ($SZ==4);
	.rva	.LSEH_begin_${func}_ssse3
	.rva	.LSEH_end_${func}_ssse3
	.rva	.LSEH_info_${func}_ssse3
$code.=<<___ if ($avx && $SZ==8);
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2

	.rva	.Lprologue,.Lepilogue			# HandlerData[]
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
$code.=<<___ if ($avx && $SZ==8);
.LSEH_info_${func}_xop:
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]

$code =~ s/\`([^\`]*)\`/eval $1/gem;