3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. Rights for redistribution and usage in source and binary
6 # forms are granted according to the OpenSSL license.
7 # ====================================================================
9 # sha256/512_block procedure for x86_64.
11 # 40% improvement over compiler-generated code on Opteron. On EM64T
12 # sha256 was observed to run >80% faster and sha512 - >40%. No magical
13 # tricks, just straight implementation... I really wonder why gcc
14 # [being armed with inline assembler] fails to generate as fast code.
15 # The only thing which is cool about this module is that it's the
16 # very same instruction sequence that is used for both SHA-256 and
17 # SHA-512. In the former case the instructions operate on 32-bit
18 # operands, while in the latter - on 64-bit ones. All I had to do was
19 # get one flavor right, the other one passed the test right away:-)
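#
# To make that concrete: the Sigma/sigma functions of the two hashes
# differ only in word width and rotate counts, so one parameterized
# sequence serves both. A minimal scalar sketch of the idea (reference
# only, never executed by this generator; ROR/Sigma1 are hypothetical
# helper names):
#
#	sub ROR { my ($x,$n,$w)=@_;	# rotate right in a $w-bit word
#	    my $mask = ($w==64) ? ~0 : (1<<$w)-1;
#	    return (($x>>$n) | ($x<<($w-$n))) & $mask;
#	}
#	sub Sigma1 { my ($x,$w,@s)=@_;	# @s=(6,11,25) for SHA-256,
#	    return ROR($x,$s[0],$w) ^	#    (14,18,41) for SHA-512
#	           ROR($x,$s[1],$w) ^ ROR($x,$s[2],$w);
#	}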
21 # sha256_block runs in ~1005 cycles on Opteron, which gives you
22 # asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
23 # frequency in GHz. sha512_block runs in ~1275 cycles, which results
24 # in 128*1000/1275=100MBps per GHz. Is there room for improvement?
25 # Well, if you compare it to IA-64 implementation, which maintains
26 # X[16] in register bank[!], tends to 4 instructions per CPU clock
27 # cycle and runs in 1003 cycles, 1275 is a very good result for the
28 # 3-way issue Opteron pipeline with X[16] maintained in memory. So *if*
29 # there is a way to improve it, *then* the only way would be to try to
30 # offload X[16] updates to SSE unit, but that would require "deeper"
31 # loop unroll, which in turn would naturally cause size blow-up, not
32 # to mention increased complexity! And once again, only *if* it's
33 # actually possible to noticeably improve overall ILP, instruction-
34 # level parallelism, on the given CPU implementation in the first place.
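#
# [For the record, the arithmetic above is simply block_bytes*1000/cycles;
# a throwaway helper, not used anywhere in this file:
#
#	sub mbps_per_ghz { my ($block_bytes,$cycles)=@_;
#	    return $block_bytes*1000/$cycles;	# 64*1000/1005=63.7,
#	}					# 128*1000/1275=100.4 ]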
36 # Special note on Intel EM64T. While Opteron CPU exhibits perfect
37 # performance ratio of 1.5 between 64- and 32-bit flavors [see above],
38 # [currently available] EM64T CPUs apparently are far from it. On the
39 # contrary, the 64-bit version, sha512_block, is ~30% *slower* than the
40 # 32-bit sha256_block:-( This is presumably because 64-bit shifts/
41 # rotates are not atomic instructions, but are implemented in microcode.
45 # An optimization including one of Pavel Semjanov's ideas, the
46 # alternative Maj, resulted in >=5% improvement on most CPUs: +20% for
47 # SHA256 and unfortunately -10% for SHA512 on P4 [which nobody should care about].
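#
# The alternative Maj in question is the identity
# Maj(a,b,c) = Ch(a^b,c,b) = ((b^c)&(a^b))^b, which needs one AND
# instead of three and lets (b^c) be carried over from the previous
# round as the next round's (a^b). A brute-force check (reference only,
# not executed by this generator):
#
#	for my $a (0,1) { for my $b (0,1) { for my $c (0,1) {
#	    my $maj = ($a&$b)^($a&$c)^($b&$c);
#	    die "alternative Maj is wrong"
#		if ($maj != ((($b^$c)&($a^$b))^$b));
#	}}}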
52 # Add SIMD code paths, see below for improvement coefficients. An SSSE3
53 # code path was not attempted for SHA512, because the estimated
54 # improvement, noticeably less than 9%, is not high enough to justify
55 # the effort, at least not on pre-AVX processors. [The obvious exclusion
56 # is VIA Nano, but it has a SHA512 instruction that is faster and
57 # should be used instead.] For reference, the corresponding estimated
58 # upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
59 # higher coefficients are observed on VIA Nano and Bulldozer has more
60 # to do with specifics of their architecture [which is a topic for
61 # separate discussion].
65 # Add AVX2 code path. Two consecutive input blocks are loaded into
66 # 256-bit %ymm registers, with data from the first block in the least
67 # significant 128-bit halves and data from the second in the most
68 # significant. The data is then processed with the same SIMD instruction
69 # sequence as for AVX, but with %ymm as operands. A side effect is an
70 # increased stack frame, 448 additional bytes in SHA256 and 1152 in SHA512.
72 ######################################################################
73 # Current performance in cycles per processed byte (less is better):
75 # SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*)
77 # AMD K8 15.1 - - 9.70 -
79 # Core 2 15.5 13.8(+12%) - 10.3 -
80 # Westmere 15.1 12.7(+19%) - 9.72 -
81 # Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**))
82 # Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%)
83 # Bulldozer 21.5 13.7(+57%) 13.7(+57%(***)) 13.5 8.58(+57%)
84 # VIA Nano 23.0 16.5(+39%) - 14.7 -
85 # Atom 23.0 18.7(+23%) - 14.7 -
87 # (*)   whichever is best applicable;
88 # (**)  the switch from ror to shrd accounts for a fair share of the improvement;
89 # (***) execution time is fully determined by the remaining integer-only
90 #       part, body_00_15; reducing the amount of SIMD instructions
91 #       below a certain limit makes no difference/sense; to conserve
92 #       space the SHA256 XOP code path is therefore omitted;
96 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
98 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
100 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
101 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
102 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
103 die "can't locate x86_64-xlate.pl";
105 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
106 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
107 $avx = ($1>=2.19) + ($1>=2.22);
110 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
111 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
112 $avx = ($1>=2.09) + ($1>=2.10);
115 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
116 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
117 $avx = ($1>=10) + ($1>=11);
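# ($avx ends up 0, 1 or 2 here: 0 emits only the scalar/SSSE3 paths, 1 adds
#  AVX/XOP, 2 adds AVX2 as well; see the "if ($avx)" and "if ($avx>1)"
#  guards below.)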
120 open OUT,"| \"$^X\" $xlate $flavour $output";
123 if ($output =~ /512/) {
124 $func="sha512_block_data_order";
127 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
128 "%r8", "%r9", "%r10","%r11");
129 ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
136 $func="sha256_block_data_order";
139 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
140 "%r8d","%r9d","%r10d","%r11d");
141 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
149 $ctx="%rdi"; # 1st arg, zapped by $a3
150 $inp="%rsi"; # 2nd arg
153 $_ctx="16*$SZ+0*8(%rsp)";
154 $_inp="16*$SZ+1*8(%rsp)";
155 $_end="16*$SZ+2*8(%rsp)";
156 $_rsp="16*$SZ+3*8(%rsp)";
157 $framesz="16*$SZ+4*8";
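# The scratch frame laid out above, worked through for SHA-256 ($SZ==4):
# X[0..15] occupies 0..63(%rsp), then $_ctx at 64, $_inp at 72, $_end at
# 80, $_rsp at 88, giving $framesz=96; for SHA-512 ($SZ==8) the X[] part
# doubles, pushing the saves to 128..152 and $framesz to 160. A throwaway
# printout of the same (hypothetical, not part of the build):
#
#	for my $SZ (4,8) {
#	    printf "SZ=%d: ctx=%d inp=%d end=%d rsp=%d framesz=%d\n",
#		   $SZ, 16*$SZ, 16*$SZ+8, 16*$SZ+16, 16*$SZ+24, 16*$SZ+32;
#	}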
161 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
163 $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
166 ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
169 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
173 mov $T1,`$SZ*($i&0xf)`(%rsp)
177 ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
179 xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
181 ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
183 add $a2,$T1 # T1+=Ch(e,f,g)
186 add ($Tbl),$T1 # T1+=K[round]
189 ror \$$Sigma1[0],$a0 # Sigma1(e)
190 xor $b,$a2 # a^b, b^c in next round
193 ror \$$Sigma0[0],$a1 # Sigma0(a)
195 add $a0,$T1 # T1+=Sigma1(e)
197 xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
201 $code.=<<___ if ($i>=15);
202 mov `$SZ*(($i+2)&0xf)`(%rsp),$a0
205 lea $STRIDE($Tbl),$Tbl # round++
206 add $a1,$h # h+=Sigma0(a)
209 ($a2,$a3) = ($a3,$a2);
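# The dataflow scheduled by ROUND_00_15 above is the textbook FIPS 180-4
# round; a straight-line reference of one round (hypothetical sketch,
# arithmetic implicitly mod 2^(8*$SZ), Sigma0/Sigma1/Ch/Maj as usual):
#
#	$T1 = $h + Sigma1($e) + Ch($e,$f,$g) + $K[$i] + $X[$i&0xf];
#	$T2 = Sigma0($a) + Maj($a,$b,$c);
#	($a,$b,$c,$d,$e,$f,$g,$h) = ($T1+$T2,$a,$b,$c,$d+$T1,$e,$f,$g);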
213 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
216 #mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
217 mov `$SZ*(($i+14)&0xf)`(%rsp),$a1
220 ror \$`$sigma0[1]-$sigma0[0]`,$a0
222 ror \$`$sigma1[1]-$sigma1[0]`,$a1
230 xor $a0,$T1 # sigma0(X[(i+1)&0xf])
232 add `$SZ*(($i+9)&0xf)`(%rsp),$T1
233 xor $a2,$a1 # sigma1(X[(i+14)&0xf])
235 add `$SZ*($i&0xf)`(%rsp),$T1
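# what the loads and adds above implement is the in-place message schedule
# over a 16-word circular buffer, i.e. the FIPS 180-4 recurrence
#
#	X[i&15] += sigma1(X[(i+14)&15]) + X[(i+9)&15] + sigma0(X[(i+1)&15])
#
# where (i+14), (i+9) and (i+1) mod 16 address X[i-2], X[i-7] and X[i-15]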
246 .extern OPENSSL_ia32cap_P
248 .type $func,\@function,3
252 $code.=<<___ if ($SZ==4 || $avx);
253 lea OPENSSL_ia32cap_P(%rip),%r11
258 $code.=<<___ if ($avx && $SZ==8);
259 test \$`1<<11`,%r10d # check for XOP
262 $code.=<<___ if ($avx>1);
263 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
264 cmp \$`1<<8|1<<5|1<<3`,%r11d
267 $code.=<<___ if ($avx);
268 and \$`1<<30`,%r9d # mask "Intel CPU" bit
269 and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
271 cmp \$`1<<28|1<<9|1<<30`,%r10d
274 $code.=<<___ if ($SZ==4);
285 mov %rsp,%r11 # copy %rsp
286 shl \$4,%rdx # num*16
288 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
289 and \$-64,%rsp # align stack frame
290 mov $ctx,$_ctx # save ctx, 1st arg
291 	mov	$inp,$_inp		# save inp, 2nd arg
292 mov %rdx,$_end # save end pointer, "3rd" arg
293 mov %r11,$_rsp # save copy of %rsp
309 lea $TABLE(%rip),$Tbl
312 for($i=0;$i<16;$i++) {
313 $code.=" mov $SZ*$i($inp),$T1\n";
314 $code.=" mov @ROT[4],$a0\n";
315 $code.=" mov @ROT[0],$a1\n";
316 $code.=" bswap $T1\n";
317 &ROUND_00_15($i,@ROT);
318 unshift(@ROT,pop(@ROT));
326 &ROUND_16_XX($i,@ROT);
327 unshift(@ROT,pop(@ROT));
331 cmpb \$0,`$SZ-1`($Tbl)
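# the byte tested above terminates the loop: every K word has a non-zero
# top byte, while byte SZ-1 of the byte-swap mask block that follows the
# table is zero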
335 lea 16*$SZ($inp),$inp
374 .type $TABLE,\@object
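# every row of K is stored twice, so that the AVX2 path, which keeps two
# blocks side by side in %ymm registers, can feed the same four K words
# to both 128-bit lanes with a single 256-bit load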
376 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
377 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
378 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
379 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
380 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
381 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
382 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
383 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
384 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
385 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
386 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
387 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
388 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
389 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
390 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
391 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
392 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
393 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
394 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
395 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
396 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
397 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
398 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
399 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
400 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
401 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
402 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
403 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
404 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
405 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
406 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
407 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
409 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
410 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
411 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
412 .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
413 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
414 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
415 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
420 .type $TABLE,\@object
422 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
423 .quad 0x428a2f98d728ae22,0x7137449123ef65cd
424 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
425 .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
426 .quad 0x3956c25bf348b538,0x59f111f1b605d019
427 .quad 0x3956c25bf348b538,0x59f111f1b605d019
428 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
429 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
430 .quad 0xd807aa98a3030242,0x12835b0145706fbe
431 .quad 0xd807aa98a3030242,0x12835b0145706fbe
432 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
433 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
434 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
435 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
436 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
437 .quad 0x9bdc06a725c71235,0xc19bf174cf692694
438 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
439 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
440 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
441 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
442 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
443 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
444 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
445 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
446 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
447 .quad 0x983e5152ee66dfab,0xa831c66d2db43210
448 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
449 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
450 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
451 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
452 .quad 0x06ca6351e003826f,0x142929670a0e6e70
453 .quad 0x06ca6351e003826f,0x142929670a0e6e70
454 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
455 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
456 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
457 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
458 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
459 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
460 .quad 0x81c2c92e47edaee6,0x92722c851482353b
461 .quad 0x81c2c92e47edaee6,0x92722c851482353b
462 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
463 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
464 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
465 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
466 .quad 0xd192e819d6ef5218,0xd69906245565a910
467 .quad 0xd192e819d6ef5218,0xd69906245565a910
468 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
469 .quad 0xf40e35855771202a,0x106aa07032bbd1b8
470 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
471 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
472 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
473 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
474 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
475 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
476 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
477 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
478 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
479 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
480 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
481 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
482 .quad 0x90befffa23631e28,0xa4506cebde82bde9
483 .quad 0x90befffa23631e28,0xa4506cebde82bde9
484 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
485 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
486 .quad 0xca273eceea26619c,0xd186b8c721c0c207
487 .quad 0xca273eceea26619c,0xd186b8c721c0c207
488 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
489 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
490 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
491 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
492 .quad 0x113f9804bef90dae,0x1b710b35131c471b
493 .quad 0x113f9804bef90dae,0x1b710b35131c471b
494 .quad 0x28db77f523047d84,0x32caab7b40c72493
495 .quad 0x28db77f523047d84,0x32caab7b40c72493
496 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
497 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
498 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
499 .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
500 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
501 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
503 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
504 .quad 0x0001020304050607,0x08090a0b0c0d0e0f
505 .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
509 ######################################################################
515 my ($a,$b,$c,$d,$e,$f,$g,$h);
517 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
518 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
520 $arg = "\$$arg" if ($arg*1 eq $arg);
521 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
526 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
528 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
533 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
534 '&xor ($a4,$g)', # f^g
536 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
538 '&and ($a4,$e)', # (f^g)&e
541 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
544 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
545 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
546 '&xor ($a2,$b)', # a^b, b^c in next round
548 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
549 '&add ($h,$a4)', # h+=Ch(e,f,g)
550 '&and ($a3,$a2)', # (b^c)&(a^b)
553 '&add ($h,$a0)', # h+=Sigma1(e)
554 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
556 '&add ($d,$h)', # d+=h
557 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
558 '&add ($h,$a3)', # h+=Maj(a,b,c)
561 '&add ($a1,$h);'. # h+=Sigma0(a)
562 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
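# body_00_15() hands back one round as a list of perl snippets rather than
# emitting code directly; the SIMD paths below eval those snippets one at
# a time between vector instructions to interleave the two streams,
# roughly (schematic sketch):
#
#	my @insns = (&$body,&$body,&$body,&$body);	# flatten 4 rounds
#	# ... emit one SIMD instruction, then ...
#	eval(shift(@insns));				# ... one scalar step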
566 ######################################################################
569 if ($SZ==4) { # SHA256 only
570 my @X = map("%xmm$_",(0..3));
571 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
574 .type ${func}_ssse3,\@function,3
584 mov %rsp,%r11 # copy %rsp
585 shl \$4,%rdx # num*16
586 sub \$`$framesz+$win64*16*4`,%rsp
587 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
588 and \$-64,%rsp # align stack frame
589 mov $ctx,$_ctx # save ctx, 1st arg
590 	mov	$inp,$_inp		# save inp, 2nd arg
591 mov %rdx,$_end # save end pointer, "3rd" arg
592 mov %r11,$_rsp # save copy of %rsp
594 $code.=<<___ if ($win64);
595 movaps %xmm6,16*$SZ+32(%rsp)
596 movaps %xmm7,16*$SZ+48(%rsp)
597 movaps %xmm8,16*$SZ+64(%rsp)
598 movaps %xmm9,16*$SZ+80(%rsp)
614 #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
615 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
619 movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
620 movdqu 0x00($inp),@X[0]
621 movdqu 0x10($inp),@X[1]
622 movdqu 0x20($inp),@X[2]
623 movdqu 0x30($inp),@X[3]
625 lea $TABLE(%rip),$Tbl
627 movdqa 0x00($Tbl),$t0
629 movdqa 0x20($Tbl),$t1
631 movdqa 0x40($Tbl),$t2
633 movdqa 0x60($Tbl),$t3
637 movdqa $t0,0x00(%rsp)
639 movdqa $t1,0x10(%rsp)
641 movdqa $t2,0x20(%rsp)
643 movdqa $t3,0x30(%rsp)
649 sub \$-16*2*$SZ,$Tbl # size optimization
651 sub Xupdate_256_SSSE3 () {
653 '&movdqa ($t0,@X[1]);',
654 '&movdqa ($t3,@X[3])',
655 '&palignr ($t0,@X[0],$SZ)', # X[1..4]
656 '&palignr ($t3,@X[2],$SZ);', # X[9..12]
658 '&movdqa ($t2,$t0);',
659 '&psrld ($t0,$sigma0[2])',
660 '&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
661 '&psrld ($t2,$sigma0[0])',
662 '&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
663 '&pslld ($t1,8*$SZ-$sigma0[1]);'.
665 '&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
667 '&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
670 '&pxor ($t0,$t1);', # sigma0(X[1..4])
671 '&psrld ($t3,$sigma1[2])',
672 '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
673 '&psrlq ($t2,$sigma1[0])',
675 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
677 '&pshufb ($t3,$t4)', # sigma1(X[14..15])
678 '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
679 '&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
680 '&movdqa ($t2,$t3);',
681 '&psrld ($t3,$sigma1[2])',
682 '&psrlq ($t2,$sigma1[0])',
684 '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
686 '&movdqa ($t2,16*2*$j."($Tbl)")',
688 '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
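# SSSE3 has no vector rotate, which is why every rotate above is
# synthesized from a shift pair; since the contributing bit positions are
# disjoint, the combining OR degenerates into the XORs of the sigma
# itself. Scalar check for SHA-256's sigma0, where the rotate/shift
# counts are (7,18,3) (reference only, never executed here):
#
#	my ($x,$m) = (0xdeadbeef,0xffffffff);
#	my $ror = sub { (($x>>$_[0]) | ($x<<(32-$_[0]))) & $m };
#	die "shift decomposition is wrong"
#	    if (($ror->(7) ^ $ror->(18) ^ ($x>>3)) !=
#		(($x>>7) ^ ($x>>18) ^ ($x>>3) ^ (($x<<25)&$m) ^ (($x<<14)&$m)));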
692 sub SSSE3_256_00_47 () {
696 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
699 foreach (Xupdate_256_SSSE3()) { # 36 instructions
705 } else { # squeeze extra 3% on Westmere and Atom
706 eval(shift(@insns)); #@
712 eval(shift(@insns)); #@
714 &palignr ($t0,@X[0],$SZ); # X[1..4]
715 eval(shift(@insns)); #@
717 &palignr ($t3,@X[2],$SZ); # X[9..12]
722 eval(shift(@insns)); #@
727 eval(shift(@insns)); #@
730 &psrld ($t0,$sigma0[2]);
734 &paddd (@X[0],$t3); # X[0..3] += X[9..12]
736 eval(shift(@insns)); #@
738 &psrld ($t2,$sigma0[0]);
741 eval(shift(@insns)); #@
743 	&pshufd	($t3,@X[3],0b11111010);	# X[14..15]
745 &pslld ($t1,8*$SZ-$sigma0[1]);
748 eval(shift(@insns)); #@
750 &psrld ($t2,$sigma0[1]-$sigma0[0]);
751 eval(shift(@insns)); #@
756 &pslld ($t1,$sigma0[1]-$sigma0[0]);
760 eval(shift(@insns)); #@
764 eval(shift(@insns)); #@
766 &pxor ($t0,$t1); # sigma0(X[1..4])
769 &psrld ($t3,$sigma1[2]);
772 &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
774 eval(shift(@insns)); #@
777 &psrlq ($t2,$sigma1[0]);
779 eval(shift(@insns)); #@
784 eval(shift(@insns)); #@
785 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
787 eval(shift(@insns)); #@
793 #&pshufb ($t3,$t4); # sigma1(X[14..15])
794 &pshufd ($t3,$t3,0b10000000);
796 eval(shift(@insns)); #@
800 eval(shift(@insns)); #@
801 &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
803 &pshufd ($t3,@X[0],0b01010000); # X[16..17]
810 eval(shift(@insns)); #@
812 &psrld ($t3,$sigma1[2]);
814 &psrlq ($t2,$sigma1[0]);
816 eval(shift(@insns)); #@
821 eval(shift(@insns)); #@
823 &psrlq ($t2,$sigma1[1]-$sigma1[0]);
824 eval(shift(@insns)); #@
831 &movdqa ($t2,16*2*$j."($Tbl)");
832 eval(shift(@insns)); #@
835 &pshufd ($t3,$t3,0b00001000);
837 eval(shift(@insns)); #@
842 &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
848 foreach (@insns) { eval; } # remaining instructions
849 &movdqa (16*$j."(%rsp)",$t2);
852 for ($i=0,$j=0; $j<4; $j++) {
853 &SSSE3_256_00_47($j,\&body_00_15,@X);
854 push(@X,shift(@X)); # rotate(@X)
856 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
857 &jne (".Lssse3_00_47");
859 for ($i=0; $i<16; ) {
860 foreach(body_00_15()) { eval; }
867 lea 16*$SZ($inp),$inp
890 $code.=<<___ if ($win64);
891 movaps 16*$SZ+32(%rsp),%xmm6
892 movaps 16*$SZ+48(%rsp),%xmm7
893 movaps 16*$SZ+64(%rsp),%xmm8
894 movaps 16*$SZ+80(%rsp),%xmm9
906 .size ${func}_ssse3,.-${func}_ssse3
911 ######################################################################
914 if ($SZ==8) { # SHA512 only
916 .type ${func}_xop,\@function,3
926 mov %rsp,%r11 # copy %rsp
927 shl \$4,%rdx # num*16
928 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
929 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
930 and \$-64,%rsp # align stack frame
931 mov $ctx,$_ctx # save ctx, 1st arg
932 	mov	$inp,$_inp		# save inp, 2nd arg
933 mov %rdx,$_end # save end pointer, "3rd" arg
934 mov %r11,$_rsp # save copy of %rsp
936 $code.=<<___ if ($win64);
937 movaps %xmm6,16*$SZ+32(%rsp)
938 movaps %xmm7,16*$SZ+48(%rsp)
939 movaps %xmm8,16*$SZ+64(%rsp)
940 movaps %xmm9,16*$SZ+80(%rsp)
942 $code.=<<___ if ($win64 && $SZ>4);
943 movaps %xmm10,16*$SZ+96(%rsp)
944 movaps %xmm11,16*$SZ+112(%rsp)
960 if ($SZ==4) { # SHA256
961 my @X = map("%xmm$_",(0..3));
962 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
967 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
968 vmovdqu 0x00($inp),@X[0]
969 vmovdqu 0x10($inp),@X[1]
970 vmovdqu 0x20($inp),@X[2]
971 vmovdqu 0x30($inp),@X[3]
972 vpshufb $t3,@X[0],@X[0]
973 lea $TABLE(%rip),$Tbl
974 vpshufb $t3,@X[1],@X[1]
975 vpshufb $t3,@X[2],@X[2]
976 vpaddd 0x00($Tbl),@X[0],$t0
977 vpshufb $t3,@X[3],@X[3]
978 vpaddd 0x20($Tbl),@X[1],$t1
979 vpaddd 0x40($Tbl),@X[2],$t2
980 vpaddd 0x60($Tbl),@X[3],$t3
981 vmovdqa $t0,0x00(%rsp)
983 vmovdqa $t1,0x10(%rsp)
985 vmovdqa $t2,0x20(%rsp)
987 vmovdqa $t3,0x30(%rsp)
993 sub \$-16*2*$SZ,$Tbl # size optimization
995 sub XOP_256_00_47 () {
999 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1001 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
1002 eval(shift(@insns));
1003 eval(shift(@insns));
1004 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
1005 eval(shift(@insns));
1006 eval(shift(@insns));
1007 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
1008 eval(shift(@insns));
1009 eval(shift(@insns));
1010 &vpsrld ($t0,$t0,$sigma0[2]);
1011 eval(shift(@insns));
1012 eval(shift(@insns));
1013 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
1014 eval(shift(@insns));
1015 eval(shift(@insns));
1016 eval(shift(@insns));
1017 eval(shift(@insns));
1018 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
1019 eval(shift(@insns));
1020 eval(shift(@insns));
1021 &vpxor ($t0,$t0,$t1);
1022 eval(shift(@insns));
1023 eval(shift(@insns));
1024 eval(shift(@insns));
1025 eval(shift(@insns));
1026 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
1027 eval(shift(@insns));
1028 eval(shift(@insns));
1029 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
1030 eval(shift(@insns));
1031 eval(shift(@insns));
1032 &vpsrld ($t2,@X[3],$sigma1[2]);
1033 eval(shift(@insns));
1034 eval(shift(@insns));
1035 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
1036 eval(shift(@insns));
1037 eval(shift(@insns));
1038 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1039 eval(shift(@insns));
1040 eval(shift(@insns));
1041 &vpxor ($t3,$t3,$t2);
1042 eval(shift(@insns));
1043 eval(shift(@insns));
1044 eval(shift(@insns));
1045 eval(shift(@insns));
1046 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1047 eval(shift(@insns));
1048 eval(shift(@insns));
1049 eval(shift(@insns));
1050 eval(shift(@insns));
1051 &vpsrldq ($t3,$t3,8);
1052 eval(shift(@insns));
1053 eval(shift(@insns));
1054 eval(shift(@insns));
1055 eval(shift(@insns));
1056 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1057 eval(shift(@insns));
1058 eval(shift(@insns));
1059 eval(shift(@insns));
1060 eval(shift(@insns));
1061 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
1062 eval(shift(@insns));
1063 eval(shift(@insns));
1064 &vpsrld ($t2,@X[0],$sigma1[2]);
1065 eval(shift(@insns));
1066 eval(shift(@insns));
1067 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
1068 eval(shift(@insns));
1069 eval(shift(@insns));
1070 &vpxor ($t3,$t3,$t2);
1071 eval(shift(@insns));
1072 eval(shift(@insns));
1073 eval(shift(@insns));
1074 eval(shift(@insns));
1075 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
1076 eval(shift(@insns));
1077 eval(shift(@insns));
1078 eval(shift(@insns));
1079 eval(shift(@insns));
1080 &vpslldq ($t3,$t3,8); # 22 instructions
1081 eval(shift(@insns));
1082 eval(shift(@insns));
1083 eval(shift(@insns));
1084 eval(shift(@insns));
1085 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
1086 eval(shift(@insns));
1087 eval(shift(@insns));
1088 eval(shift(@insns));
1089 eval(shift(@insns));
1090 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1091 foreach (@insns) { eval; } # remaining instructions
1092 &vmovdqa (16*$j."(%rsp)",$t2);
1095 for ($i=0,$j=0; $j<4; $j++) {
1096 &XOP_256_00_47($j,\&body_00_15,@X);
1097 push(@X,shift(@X)); # rotate(@X)
1099 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1100 &jne (".Lxop_00_47");
1102 for ($i=0; $i<16; ) {
1103 foreach(body_00_15()) { eval; }
1107 my @X = map("%xmm$_",(0..7));
1108 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1113 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1114 vmovdqu 0x00($inp),@X[0]
1115 lea $TABLE+0x80(%rip),$Tbl # size optimization
1116 vmovdqu 0x10($inp),@X[1]
1117 vmovdqu 0x20($inp),@X[2]
1118 vpshufb $t3,@X[0],@X[0]
1119 vmovdqu 0x30($inp),@X[3]
1120 vpshufb $t3,@X[1],@X[1]
1121 vmovdqu 0x40($inp),@X[4]
1122 vpshufb $t3,@X[2],@X[2]
1123 vmovdqu 0x50($inp),@X[5]
1124 vpshufb $t3,@X[3],@X[3]
1125 vmovdqu 0x60($inp),@X[6]
1126 vpshufb $t3,@X[4],@X[4]
1127 vmovdqu 0x70($inp),@X[7]
1128 vpshufb $t3,@X[5],@X[5]
1129 vpaddq -0x80($Tbl),@X[0],$t0
1130 vpshufb $t3,@X[6],@X[6]
1131 vpaddq -0x60($Tbl),@X[1],$t1
1132 vpshufb $t3,@X[7],@X[7]
1133 vpaddq -0x40($Tbl),@X[2],$t2
1134 vpaddq -0x20($Tbl),@X[3],$t3
1135 vmovdqa $t0,0x00(%rsp)
1136 vpaddq 0x00($Tbl),@X[4],$t0
1137 vmovdqa $t1,0x10(%rsp)
1138 vpaddq 0x20($Tbl),@X[5],$t1
1139 vmovdqa $t2,0x20(%rsp)
1140 vpaddq 0x40($Tbl),@X[6],$t2
1141 vmovdqa $t3,0x30(%rsp)
1142 vpaddq 0x60($Tbl),@X[7],$t3
1143 vmovdqa $t0,0x40(%rsp)
1145 vmovdqa $t1,0x50(%rsp)
1147 vmovdqa $t2,0x60(%rsp)
1149 vmovdqa $t3,0x70(%rsp)
1157 sub XOP_512_00_47 () {
1161 my @insns = (&$body,&$body); # 52 instructions
1163 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
1164 eval(shift(@insns));
1165 eval(shift(@insns));
1166 &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
1167 eval(shift(@insns));
1168 eval(shift(@insns));
1169 &vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
1170 eval(shift(@insns));
1171 eval(shift(@insns));
1172 &vpsrlq ($t0,$t0,$sigma0[2]);
1173 eval(shift(@insns));
1174 eval(shift(@insns));
1175 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
1176 eval(shift(@insns));
1177 eval(shift(@insns));
1178 eval(shift(@insns));
1179 eval(shift(@insns));
1180 &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
1181 eval(shift(@insns));
1182 eval(shift(@insns));
1183 &vpxor ($t0,$t0,$t1);
1184 eval(shift(@insns));
1185 eval(shift(@insns));
1186 eval(shift(@insns));
1187 eval(shift(@insns));
1188 &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
1189 eval(shift(@insns));
1190 eval(shift(@insns));
1191 &vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
1192 eval(shift(@insns));
1193 eval(shift(@insns));
1194 &vpsrlq ($t2,@X[7],$sigma1[2]);
1195 eval(shift(@insns));
1196 eval(shift(@insns));
1197 &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
1198 eval(shift(@insns));
1199 eval(shift(@insns));
1200 &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
1201 eval(shift(@insns));
1202 eval(shift(@insns));
1203 &vpxor ($t3,$t3,$t2);
1204 eval(shift(@insns));
1205 eval(shift(@insns));
1206 eval(shift(@insns));
1207 eval(shift(@insns));
1208 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
1209 eval(shift(@insns));
1210 eval(shift(@insns));
1211 eval(shift(@insns));
1212 eval(shift(@insns));
1213 &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
1214 eval(shift(@insns));
1215 eval(shift(@insns));
1216 eval(shift(@insns));
1217 eval(shift(@insns));
1218 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1219 foreach (@insns) { eval; } # remaining instructions
1220 &vmovdqa (16*$j."(%rsp)",$t2);
1223 for ($i=0,$j=0; $j<8; $j++) {
1224 &XOP_512_00_47($j,\&body_00_15,@X);
1225 push(@X,shift(@X)); # rotate(@X)
1227 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1228 &jne (".Lxop_00_47");
1230 for ($i=0; $i<16; ) {
1231 foreach(body_00_15()) { eval; }
1239 lea 16*$SZ($inp),$inp
1263 $code.=<<___ if ($win64);
1264 movaps 16*$SZ+32(%rsp),%xmm6
1265 movaps 16*$SZ+48(%rsp),%xmm7
1266 movaps 16*$SZ+64(%rsp),%xmm8
1267 movaps 16*$SZ+80(%rsp),%xmm9
1269 $code.=<<___ if ($win64 && $SZ>4);
1270 movaps 16*$SZ+96(%rsp),%xmm10
1271 movaps 16*$SZ+112(%rsp),%xmm11
1283 .size ${func}_xop,.-${func}_xop
1286 ######################################################################
1287 # AVX+shrd code path
1289 local *ror = sub { &shrd(@_[0],@_) };
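# (ror is re-aliased here to the equivalent shrd form - shrd with both
#  operands the same register is a rotate right - see note (**) above)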
1292 .type ${func}_avx,\@function,3
1302 mov %rsp,%r11 # copy %rsp
1303 shl \$4,%rdx # num*16
1304 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
1305 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1306 and \$-64,%rsp # align stack frame
1307 mov $ctx,$_ctx # save ctx, 1st arg
1308 	mov	$inp,$_inp		# save inp, 2nd arg
1309 mov %rdx,$_end # save end pointer, "3rd" arg
1310 mov %r11,$_rsp # save copy of %rsp
1312 $code.=<<___ if ($win64);
1313 movaps %xmm6,16*$SZ+32(%rsp)
1314 movaps %xmm7,16*$SZ+48(%rsp)
1315 movaps %xmm8,16*$SZ+64(%rsp)
1316 movaps %xmm9,16*$SZ+80(%rsp)
1318 $code.=<<___ if ($win64 && $SZ>4);
1319 movaps %xmm10,16*$SZ+96(%rsp)
1320 movaps %xmm11,16*$SZ+112(%rsp)
1335 if ($SZ==4) { # SHA256
1336 my @X = map("%xmm$_",(0..3));
1337 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
1340 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1341 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1345 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1346 vmovdqu 0x00($inp),@X[0]
1347 vmovdqu 0x10($inp),@X[1]
1348 vmovdqu 0x20($inp),@X[2]
1349 vmovdqu 0x30($inp),@X[3]
1350 vpshufb $t3,@X[0],@X[0]
1351 lea $TABLE(%rip),$Tbl
1352 vpshufb $t3,@X[1],@X[1]
1353 vpshufb $t3,@X[2],@X[2]
1354 vpaddd 0x00($Tbl),@X[0],$t0
1355 vpshufb $t3,@X[3],@X[3]
1356 vpaddd 0x20($Tbl),@X[1],$t1
1357 vpaddd 0x40($Tbl),@X[2],$t2
1358 vpaddd 0x60($Tbl),@X[3],$t3
1359 vmovdqa $t0,0x00(%rsp)
1361 vmovdqa $t1,0x10(%rsp)
1363 vmovdqa $t2,0x20(%rsp)
1365 vmovdqa $t3,0x30(%rsp)
1371 sub \$-16*2*$SZ,$Tbl # size optimization
1373 sub Xupdate_256_AVX () {
1375 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
1376 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
1377 '&vpsrld ($t2,$t0,$sigma0[0]);',
1378 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
1379 '&vpsrld ($t3,$t0,$sigma0[2])',
1380 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
1381 '&vpxor ($t0,$t3,$t2)',
1382 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
1383 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1384 '&vpxor ($t0,$t0,$t1)',
1385 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1386 '&vpxor ($t0,$t0,$t2)',
1387 '&vpsrld ($t2,$t3,$sigma1[2]);',
1388 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
1389 '&vpsrlq ($t3,$t3,$sigma1[0]);',
1390 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
1391 '&vpxor ($t2,$t2,$t3);',
1392 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1393 '&vpxor ($t2,$t2,$t3)',
1394 '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
1395 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
1396 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
1397 '&vpsrld ($t2,$t3,$sigma1[2])',
1398 '&vpsrlq ($t3,$t3,$sigma1[0])',
1399 '&vpxor ($t2,$t2,$t3);',
1400 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
1401 '&vpxor ($t2,$t2,$t3)',
1402 '&vpshufb ($t2,$t2,$t5)',
1403 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
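# (the three-operand AVX encoding removes the register-copy movdqa's of
#  the SSSE3 variant above, which is what shrinks the schedule update
#  from 36 to 29 instructions)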
1407 sub AVX_256_00_47 () {
1411 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
1413 foreach (Xupdate_256_AVX()) { # 29 instructions
1415 eval(shift(@insns));
1416 eval(shift(@insns));
1417 eval(shift(@insns));
1419 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1420 foreach (@insns) { eval; } # remaining instructions
1421 &vmovdqa (16*$j."(%rsp)",$t2);
1424 for ($i=0,$j=0; $j<4; $j++) {
1425 &AVX_256_00_47($j,\&body_00_15,@X);
1426 push(@X,shift(@X)); # rotate(@X)
1428 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
1429 &jne (".Lavx_00_47");
1431 for ($i=0; $i<16; ) {
1432 foreach(body_00_15()) { eval; }
1436 my @X = map("%xmm$_",(0..7));
1437 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
1443 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1444 vmovdqu 0x00($inp),@X[0]
1445 lea $TABLE+0x80(%rip),$Tbl # size optimization
1446 vmovdqu 0x10($inp),@X[1]
1447 vmovdqu 0x20($inp),@X[2]
1448 vpshufb $t3,@X[0],@X[0]
1449 vmovdqu 0x30($inp),@X[3]
1450 vpshufb $t3,@X[1],@X[1]
1451 vmovdqu 0x40($inp),@X[4]
1452 vpshufb $t3,@X[2],@X[2]
1453 vmovdqu 0x50($inp),@X[5]
1454 vpshufb $t3,@X[3],@X[3]
1455 vmovdqu 0x60($inp),@X[6]
1456 vpshufb $t3,@X[4],@X[4]
1457 vmovdqu 0x70($inp),@X[7]
1458 vpshufb $t3,@X[5],@X[5]
1459 vpaddq -0x80($Tbl),@X[0],$t0
1460 vpshufb $t3,@X[6],@X[6]
1461 vpaddq -0x60($Tbl),@X[1],$t1
1462 vpshufb $t3,@X[7],@X[7]
1463 vpaddq -0x40($Tbl),@X[2],$t2
1464 vpaddq -0x20($Tbl),@X[3],$t3
1465 vmovdqa $t0,0x00(%rsp)
1466 vpaddq 0x00($Tbl),@X[4],$t0
1467 vmovdqa $t1,0x10(%rsp)
1468 vpaddq 0x20($Tbl),@X[5],$t1
1469 vmovdqa $t2,0x20(%rsp)
1470 vpaddq 0x40($Tbl),@X[6],$t2
1471 vmovdqa $t3,0x30(%rsp)
1472 vpaddq 0x60($Tbl),@X[7],$t3
1473 vmovdqa $t0,0x40(%rsp)
1475 vmovdqa $t1,0x50(%rsp)
1477 vmovdqa $t2,0x60(%rsp)
1479 vmovdqa $t3,0x70(%rsp)
1487 sub Xupdate_512_AVX () {
1489 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
1490 '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
1491 '&vpsrlq ($t2,$t0,$sigma0[0])',
1492 '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
1493 '&vpsrlq ($t3,$t0,$sigma0[2])',
1494 '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
1495 '&vpxor ($t0,$t3,$t2)',
1496 '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
1497 '&vpxor ($t0,$t0,$t1)',
1498 '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
1499 '&vpxor ($t0,$t0,$t2)',
1500 '&vpsrlq ($t3,@X[7],$sigma1[2]);',
1501 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
1502 '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
1503 '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
1504 '&vpsrlq ($t1,@X[7],$sigma1[0]);',
1505 '&vpxor ($t3,$t3,$t2)',
1506 '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
1507 '&vpxor ($t3,$t3,$t1)',
1508 '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
1509 '&vpxor ($t3,$t3,$t2)',
1510 '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
1511 '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
1515 sub AVX_512_00_47 () {
1519 my @insns = (&$body,&$body); # 52 instructions
1521 foreach (Xupdate_512_AVX()) { # 23 instructions
1523 eval(shift(@insns));
1524 eval(shift(@insns));
1526 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1527 foreach (@insns) { eval; } # remaining instructions
1528 &vmovdqa (16*$j."(%rsp)",$t2);
1531 for ($i=0,$j=0; $j<8; $j++) {
1532 &AVX_512_00_47($j,\&body_00_15,@X);
1533 push(@X,shift(@X)); # rotate(@X)
1535 &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
1536 &jne (".Lavx_00_47");
1538 for ($i=0; $i<16; ) {
1539 foreach(body_00_15()) { eval; }
1547 lea 16*$SZ($inp),$inp
1571 $code.=<<___ if ($win64);
1572 movaps 16*$SZ+32(%rsp),%xmm6
1573 movaps 16*$SZ+48(%rsp),%xmm7
1574 movaps 16*$SZ+64(%rsp),%xmm8
1575 movaps 16*$SZ+80(%rsp),%xmm9
1577 $code.=<<___ if ($win64 && $SZ>4);
1578 movaps 16*$SZ+96(%rsp),%xmm10
1579 movaps 16*$SZ+112(%rsp),%xmm11
1591 .size ${func}_avx,.-${func}_avx
1595 ######################################################################
1596 # AVX2+BMI code path
1598 my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
1602 sub bodyx_00_15 () {
1603 # at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
1605 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
1607 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
1608 '&and ($a4,$e)', # f&e
1609 '&rorx ($a0,$e,$Sigma1[2])',
1610 '&rorx ($a2,$e,$Sigma1[1])',
1612 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
1613 '&lea ($h,"($h,$a4)")',
1614 '&andn ($a4,$e,$g)', # ~e&g
1617 '&rorx ($a1,$e,$Sigma1[0])',
1618 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
1619 '&xor ($a0,$a1)', # Sigma1(e)
1622 '&rorx ($a4,$a,$Sigma0[2])',
1623 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
1624 '&xor ($a2,$b)', # a^b, b^c in next round
1625 '&rorx ($a1,$a,$Sigma0[1])',
1627 '&rorx ($a0,$a,$Sigma0[0])',
1628 '&lea ($d,"($d,$h)")', # d+=h
1629 '&and ($a3,$a2)', # (b^c)&(a^b)
1632 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
1633 '&xor ($a1,$a0)', # Sigma0(a)
1634 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
1635 '&mov ($a4,$e)', # copy of f in future
1637 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
1639 # and at the finish one has to do $a+=$a1, folding in the last Sigma0(a)
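# rorx is preferred because, unlike ror, it leaves the flags untouched,
# and andn yields ~e&g in one instruction; Ch(e,f,g)=(e&f)^(~e&g) may be
# accumulated with lea's '+' instead of xor because the two terms never
# have a bit set in common. Brute-force check (reference only):
#
#	for my $e (0,1) { for my $f (0,1) { for my $g (0,1) {
#	    my ($t0,$t1) = ($e&$f, ~$e&$g&1);
#	    die "Ch terms overlap" if ($t0&$t1);
#	    die "add != xor" if (($t0+$t1) != ($t0^$t1));
#	}}}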
1643 .type ${func}_avx2,\@function,3
1653 mov %rsp,%r11 # copy %rsp
1654 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
1655 shl \$4,%rdx # num*16
1656 and \$-256*$SZ,%rsp # align stack frame
1657 lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
1658 add \$`2*$SZ*($rounds-8)`,%rsp
1659 mov $ctx,$_ctx # save ctx, 1st arg
1660 	mov	$inp,$_inp		# save inp, 2nd arg
1661 mov %rdx,$_end # save end pointer, "3rd" arg
1662 mov %r11,$_rsp # save copy of %rsp
1664 $code.=<<___ if ($win64);
1665 movaps %xmm6,16*$SZ+32(%rsp)
1666 movaps %xmm7,16*$SZ+48(%rsp)
1667 movaps %xmm8,16*$SZ+64(%rsp)
1668 movaps %xmm9,16*$SZ+80(%rsp)
1670 $code.=<<___ if ($win64 && $SZ>4);
1671 movaps %xmm10,16*$SZ+96(%rsp)
1672 movaps %xmm11,16*$SZ+112(%rsp)
1678 sub \$-16*$SZ,$inp # inp++, size optimization
1680 mov $inp,%r12 # borrow $T1
1682 cmp %rdx,$inp # $_end
1684 cmove %rsp,%r12 # next block or random data
1691 if ($SZ==4) { # SHA256
1692 my @X = map("%ymm$_",(0..3));
1693 my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
1696 vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
1697 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
1701 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1702 vmovdqu -16*$SZ+0($inp),%xmm0
1703 vmovdqu -16*$SZ+16($inp),%xmm1
1704 vmovdqu -16*$SZ+32($inp),%xmm2
1705 vmovdqu -16*$SZ+48($inp),%xmm3
1706 #mov $inp,$_inp # offload $inp
1707 vinserti128 \$1,(%r12),@X[0],@X[0]
1708 vinserti128 \$1,16(%r12),@X[1],@X[1]
1709 vpshufb $t3,@X[0],@X[0]
1710 vinserti128 \$1,32(%r12),@X[2],@X[2]
1711 vpshufb $t3,@X[1],@X[1]
1712 vinserti128 \$1,48(%r12),@X[3],@X[3]
1714 lea $TABLE(%rip),$Tbl
1715 vpshufb $t3,@X[2],@X[2]
1716 vpaddd 0x00($Tbl),@X[0],$t0
1717 vpshufb $t3,@X[3],@X[3]
1718 vpaddd 0x20($Tbl),@X[1],$t1
1719 vpaddd 0x40($Tbl),@X[2],$t2
1720 vpaddd 0x60($Tbl),@X[3],$t3
1721 vmovdqa $t0,0x00(%rsp)
1723 vmovdqa $t1,0x20(%rsp)
1724 lea -$PUSH8(%rsp),%rsp
1726 vmovdqa $t2,0x00(%rsp)
1728 vmovdqa $t3,0x20(%rsp)
1730 sub \$-16*2*$SZ,$Tbl # size optimization
1737 sub AVX2_256_00_47 () {
1741 my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
1742 my $base = "+2*$PUSH8(%rsp)";
1744 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
1745 foreach (Xupdate_256_AVX()) { # 29 instructions
1747 eval(shift(@insns));
1748 eval(shift(@insns));
1749 eval(shift(@insns));
1751 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1752 foreach (@insns) { eval; } # remaining instructions
1753 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
1756 for ($i=0,$j=0; $j<4; $j++) {
1757 &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1758 push(@X,shift(@X)); # rotate(@X)
1760 &lea ($Tbl,16*2*$SZ."($Tbl)");
1761 &cmpb (($SZ-1)."($Tbl)",0);
1762 &jne (".Lavx2_00_47");
1764 for ($i=0; $i<16; ) {
1765 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1766 foreach(bodyx_00_15()) { eval; }
1769 my @X = map("%ymm$_",(0..7));
1770 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
1776 vmovdqu -16*$SZ($inp),%xmm0
1777 vmovdqu -16*$SZ+16($inp),%xmm1
1778 vmovdqu -16*$SZ+32($inp),%xmm2
1779 lea $TABLE+0x80(%rip),$Tbl # size optimization
1780 vmovdqu -16*$SZ+48($inp),%xmm3
1781 vmovdqu -16*$SZ+64($inp),%xmm4
1782 vmovdqu -16*$SZ+80($inp),%xmm5
1783 vmovdqu -16*$SZ+96($inp),%xmm6
1784 vmovdqu -16*$SZ+112($inp),%xmm7
1785 #mov $inp,$_inp # offload $inp
1786 vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2
1787 vinserti128 \$1,(%r12),@X[0],@X[0]
1788 vinserti128 \$1,16(%r12),@X[1],@X[1]
1789 vpshufb $t2,@X[0],@X[0]
1790 vinserti128 \$1,32(%r12),@X[2],@X[2]
1791 vpshufb $t2,@X[1],@X[1]
1792 vinserti128 \$1,48(%r12),@X[3],@X[3]
1793 vpshufb $t2,@X[2],@X[2]
1794 vinserti128 \$1,64(%r12),@X[4],@X[4]
1795 vpshufb $t2,@X[3],@X[3]
1796 vinserti128 \$1,80(%r12),@X[5],@X[5]
1797 vpshufb $t2,@X[4],@X[4]
1798 vinserti128 \$1,96(%r12),@X[6],@X[6]
1799 vpshufb $t2,@X[5],@X[5]
1800 vinserti128 \$1,112(%r12),@X[7],@X[7]
1802 vpaddq -0x80($Tbl),@X[0],$t0
1803 vpshufb $t2,@X[6],@X[6]
1804 vpaddq -0x60($Tbl),@X[1],$t1
1805 vpshufb $t2,@X[7],@X[7]
1806 vpaddq -0x40($Tbl),@X[2],$t2
1807 vpaddq -0x20($Tbl),@X[3],$t3
1808 vmovdqa $t0,0x00(%rsp)
1809 vpaddq 0x00($Tbl),@X[4],$t0
1810 vmovdqa $t1,0x20(%rsp)
1811 vpaddq 0x20($Tbl),@X[5],$t1
1812 vmovdqa $t2,0x40(%rsp)
1813 vpaddq 0x40($Tbl),@X[6],$t2
1814 vmovdqa $t3,0x60(%rsp)
1815 lea -$PUSH8(%rsp),%rsp
1816 vpaddq 0x60($Tbl),@X[7],$t3
1817 vmovdqa $t0,0x00(%rsp)
1819 vmovdqa $t1,0x20(%rsp)
1821 vmovdqa $t2,0x40(%rsp)
1823 vmovdqa $t3,0x60(%rsp)
1832 sub AVX2_512_00_47 () {
1836 my @insns = (&$body,&$body); # 48 instructions
1837 my $base = "+2*$PUSH8(%rsp)";
1839 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0);
1840 foreach (Xupdate_512_AVX()) { # 23 instructions
1843 eval(shift(@insns));
1844 eval(shift(@insns));
1845 eval(shift(@insns));
1848 &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
1849 foreach (@insns) { eval; } # remaining instructions
1850 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
1853 for ($i=0,$j=0; $j<8; $j++) {
1854 &AVX2_512_00_47($j,\&bodyx_00_15,@X);
1855 push(@X,shift(@X)); # rotate(@X)
1857 &lea ($Tbl,16*2*$SZ."($Tbl)");
1858 &cmpb (($SZ-1-0x80)."($Tbl)",0);
1859 &jne (".Lavx2_00_47");
1861 for ($i=0; $i<16; ) {
1862 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1863 foreach(bodyx_00_15()) { eval; }
1867 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
1869 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
1870 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
1890 cmp `$PUSH8+2*8`($Tbl),$inp # $_end
1901 for ($i=0; $i<8; ) {
1902 my $base="+16($Tbl)";
1903 foreach(bodyx_00_15()) { eval; }
1906 lea -$PUSH8($Tbl),$Tbl
1910 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
1912 #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
1913 lea `2*$SZ*($rounds-8)`(%rsp),%rsp
1921 lea `2*16*$SZ`($inp),$inp # inp+=2
1928 cmove %rsp,%r12 # next block or stale data
1945 $code.=<<___ if ($win64);
1946 movaps 16*$SZ+32(%rsp),%xmm6
1947 movaps 16*$SZ+48(%rsp),%xmm7
1948 movaps 16*$SZ+64(%rsp),%xmm8
1949 movaps 16*$SZ+80(%rsp),%xmm9
1951 $code.=<<___ if ($win64 && $SZ>4);
1952 movaps 16*$SZ+96(%rsp),%xmm10
1953 movaps 16*$SZ+112(%rsp),%xmm11
1965 .size ${func}_avx2,.-${func}_avx2
1970 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1971 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1979 .extern __imp_RtlVirtualUnwind
1980 .type se_handler,\@abi-omnipotent
1994 mov 120($context),%rax # pull context->Rax
1995 mov 248($context),%rbx # pull context->Rip
1997 mov 8($disp),%rsi # disp->ImageBase
1998 	mov	56($disp),%r11		# disp->HandlerData
2000 mov 0(%r11),%r10d # HandlerData[0]
2001 lea (%rsi,%r10),%r10 # prologue label
2002 cmp %r10,%rbx # context->Rip<prologue label
2005 mov 152($context),%rax # pull context->Rsp
2007 mov 4(%r11),%r10d # HandlerData[1]
2008 lea (%rsi,%r10),%r10 # epilogue label
2009 cmp %r10,%rbx # context->Rip>=epilogue label
2012 $code.=<<___ if ($avx>1);
2013 lea .Lavx2_shortcut(%rip),%r10
2014 cmp %r10,%rbx # context->Rip<avx2_shortcut
2018 add \$`2*$SZ*($rounds-8)`,%rax
2022 mov %rax,%rsi # put aside Rsp
2023 mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
2032 mov %rbx,144($context) # restore context->Rbx
2033 mov %rbp,160($context) # restore context->Rbp
2034 mov %r12,216($context) # restore context->R12
2035 mov %r13,224($context) # restore context->R13
2036 mov %r14,232($context) # restore context->R14
2037 mov %r15,240($context) # restore context->R15
2039 lea .Lepilogue(%rip),%r10
2041 jb .Lin_prologue # non-AVX code
2043 lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area
2044 lea 512($context),%rdi # &context.Xmm6
2045 mov \$`$SZ==4?8:12`,%ecx
2046 .long 0xa548f3fc # cld; rep movsq
2051 mov %rax,152($context) # restore context->Rsp
2052 mov %rsi,168($context) # restore context->Rsi
2053 mov %rdi,176($context) # restore context->Rdi
2055 mov 40($disp),%rdi # disp->ContextRecord
2056 mov $context,%rsi # context
2057 mov \$154,%ecx # sizeof(CONTEXT)
2058 .long 0xa548f3fc # cld; rep movsq
2061 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2062 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2063 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2064 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2065 mov 40(%rsi),%r10 # disp->ContextRecord
2066 lea 56(%rsi),%r11 # &disp->HandlerData
2067 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2068 mov %r10,32(%rsp) # arg5
2069 mov %r11,40(%rsp) # arg6
2070 mov %r12,48(%rsp) # arg7
2071 mov %rcx,56(%rsp) # arg8, (NULL)
2072 call *__imp_RtlVirtualUnwind(%rip)
2074 mov \$1,%eax # ExceptionContinueSearch
2086 .size se_handler,.-se_handler
2090 .rva .LSEH_begin_$func
2091 .rva .LSEH_end_$func
2092 .rva .LSEH_info_$func
2094 $code.=<<___ if ($SZ==4);
2095 .rva .LSEH_begin_${func}_ssse3
2096 .rva .LSEH_end_${func}_ssse3
2097 .rva .LSEH_info_${func}_ssse3
2099 $code.=<<___ if ($avx && $SZ==8);
2100 .rva .LSEH_begin_${func}_xop
2101 .rva .LSEH_end_${func}_xop
2102 .rva .LSEH_info_${func}_xop
2104 $code.=<<___ if ($avx);
2105 .rva .LSEH_begin_${func}_avx
2106 .rva .LSEH_end_${func}_avx
2107 .rva .LSEH_info_${func}_avx
2109 $code.=<<___ if ($avx>1);
2110 .rva .LSEH_begin_${func}_avx2
2111 .rva .LSEH_end_${func}_avx2
2112 .rva .LSEH_info_${func}_avx2
2120 .rva .Lprologue,.Lepilogue # HandlerData[]
2122 $code.=<<___ if ($SZ==4);
2123 .LSEH_info_${func}_ssse3:
2126 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
2128 $code.=<<___ if ($avx && $SZ==8);
2129 .LSEH_info_${func}_xop:
2132 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
2134 $code.=<<___ if ($avx);
2135 .LSEH_info_${func}_avx:
2138 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
2140 $code.=<<___ if ($avx>1);
2141 .LSEH_info_${func}_avx2:
2144 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
2148 $code =~ s/\`([^\`]*)\`/eval $1/gem;