2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # Multi-buffer SHA256 procedure processes n buffers in parallel by
18 # placing buffer data to designated lane of SIMD register. n is
19 # naturally limited to 4 on pre-AVX2 processors and to 8 on
20 # AVX2-capable processors such as Haswell.
22 # this +aesni(i) sha256 aesni-sha256 gain(iv)
23 # -------------------------------------------------------------------
24 # Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126%
25 # Atom(ii) 38.7/n +3.93=13.6(n=4) 20.8 +5.69=26.5 +95%
26 # Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103%
27 # Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82%
28 # Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170%
29 # Skylake (18.9 +5.00=23.9)/n 7.70 8.17 +170%
30 # Bulldozer (21.6 +5.76=27.4)/n 13.6 13.7 +100%
32 # (i) multi-block CBC encrypt with 128-bit key;
33 # (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
34 # because of lower AES-NI instruction throughput, nor is there
35 # AES-NI-SHA256 stitch for these processors;
36 # (iii) "this" is for n=8, when we gather twice as much data, result
37 # for n=4 is 20.3+4.44=24.7;
38 # (iv) presented improvement coefficients are asymptotic limits and
39 # in real-life application are somewhat lower, e.g. for 2KB
40 # fragments they range from 75% to 130% (on Haswell);
42 # $output is the last argument if it looks like a file (it has an extension)
43 # $flavour is the first argument if it doesn't look like a file
# --- Argument parsing and toolchain probing (sampled view; interior lines
# of the original file are elided) ---
# $output is the last CLI argument if it has a file extension; $flavour is
# the first argument if it does not look like a file name.
44 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
45 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Windows targets are identified by a nasm/masm/mingw64 flavour or an
# .asm output file name.
47 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the perlasm translator next to this script or in ../../perlasm.
49 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
51 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
52 die "can't locate x86_64-xlate.pl";
# Probe the assembler to set $avx; each satisfied version threshold below
# adds one, so the sum encodes the capability level (0, 1 or 2).
# NOTE(review): $ENV{CC}/$ENV{ASM} may be unset here, which would produce
# "use of uninitialized value" warnings under -w — confirm this is accepted
# upstream before changing.
56 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
57 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
58 $avx = ($1>=2.19) + ($1>=2.22);
61 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
62 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
63 $avx = ($1>=2.09) + ($1>=2.10);
66 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
67 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
68 $avx = ($1>=10) + ($1>=11);
71 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
72 $avx = ($2>=3.0) + ($2>3.0);
# All generated code is piped through the perlasm translator to $output.
75 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
76 or die "can't call $xlate: $!";
# C prototype of the generated entry points: *ctx is a lane-interleaved
# view of the SHA-256 state words, inp[] carries a data pointer plus block
# count per lane, num selects how many lane groups to process.
79 # void sha256_multi_block (
80 # struct { unsigned int A[8];
87 # unsigned int H[8]; } *ctx,
88 # struct { void *ptr; int blocks; } inp[8],
89 # int num); /* 1 or 2 */
# Register assignment: System V argument registers, with r8..r11 holding
# the per-lane input pointers for the 4-lane (SSE/AVX) paths.
91 $ctx="%rdi"; # 1st arg
92 $inp="%rsi"; # 2nd arg
93 $num="%edx"; # 3rd arg
94 @ptr=map("%r$_",(8..11));
# Working set: state A..H live in xmm8..15; xmm0..7 are scratch — temps,
# the borrowed a^b / b^c registers, current/next message words and sigma.
97 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
98 ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));
# Xi_off body (sub header elided in this view): maps message-schedule slot
# $off to a stack operand string; the first 256 bytes are addressed off
# %rax-128, the remainder off %rbx.
105 $off %= 16; $off *= $REG_SZ;
106 $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
# ROUND_00_15 fragment (sub header elided in this view): emits one SSE
# round over 4 interleaved lanes. Rounds 0..15 additionally gather one
# 32-bit message word per lane with movd and byte-swap them via pshufb
# ($Xn holds the pbswap mask, loaded below at context-load time).
110 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
112 $code.=<<___ if ($i<15);
113 movd `4*$i`(@ptr[0]),$Xi
114 movd `4*$i`(@ptr[1]),$t1
115 movd `4*$i`(@ptr[2]),$t2
116 movd `4*$i`(@ptr[3]),$t3
# Round 15 also advances every lane pointer past the 64-byte block.
121 $code.=<<___ if ($i==15);
122 movd `4*$i`(@ptr[0]),$Xi
123 lea `16*4`(@ptr[0]),@ptr[0]
124 movd `4*$i`(@ptr[1]),$t1
125 lea `16*4`(@ptr[1]),@ptr[1]
126 movd `4*$i`(@ptr[2]),$t2
127 lea `16*4`(@ptr[2]),@ptr[2]
128 movd `4*$i`(@ptr[3]),$t3
129 lea `16*4`(@ptr[3]),@ptr[3]
136 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==0)`
138 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==1)`
# The swapped word is stashed in the schedule area for rounds 16..63.
142 movdqa $Xi,`&Xi_off($i)`
148 paddd `32*($i%8)-128`($Tbl),$Xi # Xi+=K[round]
# Prefetches of the next data are spread over the round-15 instance.
153 `"prefetcht0 63(@ptr[0])" if ($i==15)`
155 movdqa $e,$axb # borrow $axb
161 `"prefetcht0 63(@ptr[1])" if ($i==15)`
163 pxor $t3,$sigma # Sigma1(e)
166 paddd $sigma,$Xi # Xi+=Sigma1(e)
167 pxor $axb,$t1 # Ch(e,f,g)
171 pxor $a,$axb # a^b, b^c in next round
173 `"prefetcht0 63(@ptr[2])" if ($i==15)`
176 paddd $t1,$Xi # Xi+=Ch(e,f,g)
181 `"prefetcht0 63(@ptr[3])" if ($i==15)`
187 pxor $bxc,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
189 pxor $t3,$sigma # Sigma0(a)
192 paddd $sigma,$h # h+=Sigma0(a)
# $Tbl advances one 8-round stride (32 bytes per round) every 8 rounds.
194 $code.=<<___ if (($i%8)==7);
195 lea `32*8`($Tbl),$Tbl
# Swap the borrowed a^b / b^c registers for the next round's rotation.
197 ($axb,$bxc)=($bxc,$axb);
# ROUND_16_XX fragment (sub header elided): message-schedule expansion for
# rounds 16..63 — Xi += X[i+9] + sigma0(X[i+1]) + sigma1(X[i+14]) before
# falling into the shared round body (per the inline comments below).
204 movdqa `&Xi_off($i+1)`,$Xn
205 paddd `&Xi_off($i+9)`,$Xi # Xi+=X[i+9]
213 movdqa `&Xi_off($i+14)`,$t1
217 movdqa $t1,$axb # borrow $axb
225 pxor $t3,$sigma # sigma0(X[i+1])
227 paddd $sigma,$Xi # Xi+=sigma0(e)
# NOTE(review): the two comments below label the same $t1 as sigma0 then
# add it as sigma1 — the intermediate lines are elided here, so the full
# derivation of sigma1(X[i+14]) is not visible; confirm against upstream.
233 pxor $t2,$t1 # sigma0(X[i+14])
234 paddd $t1,$Xi # Xi+=sigma1(X[i+14])
# --- sha256_multi_block: baseline SSE entry point (sampled view) ---
# Dispatches to the SHA-NI path when capability bit 61 is set, otherwise
# runs the 4-lane SSE loop generated from ROUND_00_15/ROUND_16_XX.
243 .extern OPENSSL_ia32cap_P
245 .globl sha256_multi_block
246 .type sha256_multi_block,\@function,3
250 mov OPENSSL_ia32cap_P+4(%rip),%rcx
251 bt \$61,%rcx # check SHA bit
254 $code.=<<___ if ($avx);
260 .cfi_def_cfa_register %rax
# Win64 ABI: xmm6..15 are callee-saved and must be preserved.
266 $code.=<<___ if ($win64);
269 movaps %xmm7,0x10(%rsp)
270 movaps %xmm8,0x20(%rsp)
271 movaps %xmm9,0x30(%rsp)
272 movaps %xmm10,-0x78(%rax)
273 movaps %xmm11,-0x68(%rax)
274 movaps %xmm12,-0x58(%rax)
275 movaps %xmm13,-0x48(%rax)
276 movaps %xmm14,-0x38(%rax)
277 movaps %xmm15,-0x28(%rax)
# Frame: REG_SZ*16 of schedule space + counters, with the original %rsp
# saved at slot REG_SZ*17 for the epilogue and the SEH handler.
280 sub \$`$REG_SZ*18`, %rsp
282 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
283 .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
285 lea K256+128(%rip),$Tbl
286 lea `$REG_SZ*16`(%rsp),%rbx
287 lea 0x80($ctx),$ctx # size optimization
290 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
# Load the 4 per-lane input pointers and block counts; exhausted lanes
# (count <= 0) are redirected at the K256 table so their loads are harmless.
293 for($i=0;$i<4;$i++) {
295 mov `16*$i+0`($inp),@ptr[$i] # input pointer
296 mov `16*$i+8`($inp),%ecx # number of blocks
298 cmovg %ecx,$num # find maximum
300 mov %ecx,`4*$i`(%rbx) # initialize counters
301 cmovle $Tbl,@ptr[$i] # cancel input
# Load the interleaved state; offsets are biased by the -0x80 applied to
# $ctx above.
308 movdqu 0x00-0x80($ctx),$A # load context
310 movdqu 0x20-0x80($ctx),$B
311 movdqu 0x40-0x80($ctx),$C
312 movdqu 0x60-0x80($ctx),$D
313 movdqu 0x80-0x80($ctx),$E
314 movdqu 0xa0-0x80($ctx),$F
315 movdqu 0xc0-0x80($ctx),$G
316 movdqu 0xe0-0x80($ctx),$H
317 movdqu .Lpbswap(%rip),$Xn
323 pxor $B,$bxc # magic seed
# 16 gather+round iterations, then schedule-expansion rounds to 64.
325 for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
327 movdqu `&Xi_off($i)`,$Xi
333 for(;$i<32;$i++) { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
339 lea K256+128(%rip),$Tbl
# Decrement per-lane counters and mask out lanes that have finished.
341 movdqa (%rbx),$sigma # pull counters
342 cmp 4*0(%rbx),%ecx # examine counters
344 cmovge $Tbl,@ptr[0] # cancel input
349 pcmpgtd $t1,$Xn # mask value
352 paddd $Xn,$sigma # counters--
# Merge the freshly computed state with the saved context (add per the
# SHA-256 chaining rule; intermediate blend lines are elided in this view).
355 movdqu 0x00-0x80($ctx),$t1
357 movdqu 0x20-0x80($ctx),$t2
359 movdqu 0x40-0x80($ctx),$t3
361 movdqu 0x60-0x80($ctx),$Xi
364 movdqu 0x80-0x80($ctx),$t1
367 movdqu 0xa0-0x80($ctx),$t2
370 movdqu 0xc0-0x80($ctx),$t3
373 movdqu 0xe0-0x80($ctx),$Xi
377 movdqu $A,0x00-0x80($ctx)
379 movdqu $B,0x20-0x80($ctx)
381 movdqu $C,0x40-0x80($ctx)
382 movdqu $D,0x60-0x80($ctx)
383 movdqu $E,0x80-0x80($ctx)
384 movdqu $F,0xa0-0x80($ctx)
385 movdqu $G,0xc0-0x80($ctx)
386 movdqu $H,0xe0-0x80($ctx)
388 movdqa $sigma,(%rbx) # save counters
389 movdqa .Lpbswap(%rip),$Xn
# Advance to the next group of lanes, restore registers and return.
393 mov `$REG_SZ*17+8`(%rsp),$num
394 lea $REG_SZ($ctx),$ctx
395 lea `16*$REG_SZ/4`($inp),$inp
400 mov `$REG_SZ*17`(%rsp),%rax # original %rsp
403 $code.=<<___ if ($win64);
404 movaps -0xb8(%rax),%xmm6
405 movaps -0xa8(%rax),%xmm7
406 movaps -0x98(%rax),%xmm8
407 movaps -0x88(%rax),%xmm9
408 movaps -0x78(%rax),%xmm10
409 movaps -0x68(%rax),%xmm11
410 movaps -0x58(%rax),%xmm12
411 movaps -0x48(%rax),%xmm13
412 movaps -0x38(%rax),%xmm14
413 movaps -0x28(%rax),%xmm15
421 .cfi_def_cfa_register %rsp
425 .size sha256_multi_block,.-sha256_multi_block
# --- sha256_multi_block_shaext: SHA-NI path, two lanes at a time ---
# Uses sha256rnds2/sha256msg1/sha256msg2 on the ABEF/CDGH packed state
# layout required by the SHA extensions (sampled view; many lines elided).
428 my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
429 my @MSG0=map("%xmm$_",(4..7));
430 my @MSG1=map("%xmm$_",(8..11));
433 .type sha256_multi_block_shaext,\@function,3
435 sha256_multi_block_shaext:
439 .cfi_def_cfa_register %rax
# Win64 ABI: preserve callee-saved xmm6..15, as in the SSE entry.
445 $code.=<<___ if ($win64);
448 movaps %xmm7,0x10(%rsp)
449 movaps %xmm8,0x20(%rsp)
450 movaps %xmm9,0x30(%rsp)
451 movaps %xmm10,-0x78(%rax)
452 movaps %xmm11,-0x68(%rax)
453 movaps %xmm12,-0x58(%rax)
454 movaps %xmm13,-0x48(%rax)
455 movaps %xmm14,-0x38(%rax)
456 movaps %xmm15,-0x28(%rax)
459 sub \$`$REG_SZ*18`,%rsp
460 shl \$1,$num # we process pair at a time
462 lea 0x80($ctx),$ctx # size optimization
463 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
465 lea `$REG_SZ*16`(%rsp),%rbx
466 lea K256_shaext+0x80(%rip),$Tbl
469 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
# Load two lanes' pointers/counters; exhausted lanes read from %rsp.
472 for($i=0;$i<2;$i++) {
474 mov `16*$i+0`($inp),@ptr[$i] # input pointer
475 mov `16*$i+8`($inp),%ecx # number of blocks
477 cmovg %ecx,$num # find maximum
479 mov %ecx,`4*$i`(%rbx) # initialize counters
480 cmovle %rsp,@ptr[$i] # cancel input
# Repack the lane-interleaved context into ABEF/CDGH order for each lane
# (see the trailing layout comments on each line).
487 movq 0x00-0x80($ctx),$ABEF0 # A1.A0
488 movq 0x20-0x80($ctx),@MSG0[0] # B1.B0
489 movq 0x40-0x80($ctx),$CDGH0 # C1.C0
490 movq 0x60-0x80($ctx),@MSG0[1] # D1.D0
491 movq 0x80-0x80($ctx),@MSG1[0] # E1.E0
492 movq 0xa0-0x80($ctx),@MSG1[1] # F1.F0
493 movq 0xc0-0x80($ctx),@MSG1[2] # G1.G0
494 movq 0xe0-0x80($ctx),@MSG1[3] # H1.H0
496 punpckldq @MSG0[0],$ABEF0 # B1.A1.B0.A0
497 punpckldq @MSG0[1],$CDGH0 # D1.C1.D0.C0
498 punpckldq @MSG1[1],@MSG1[0] # F1.E1.F0.E0
499 punpckldq @MSG1[3],@MSG1[2] # H1.G1.H0.G0
500 movdqa K256_shaext-0x10(%rip),$TMPx # byte swap
504 punpcklqdq @MSG1[0],$ABEF0 # F0.E0.B0.A0
505 punpcklqdq @MSG1[2],$CDGH0 # H0.G0.D0.C0
506 punpckhqdq @MSG1[0],$ABEF1 # F1.E1.B1.A1
507 punpckhqdq @MSG1[2],$CDGH1 # H1.G1.D1.C1
509 pshufd \$0b00011011,$ABEF0,$ABEF0
510 pshufd \$0b00011011,$CDGH0,$CDGH0
511 pshufd \$0b00011011,$ABEF1,$ABEF1
512 pshufd \$0b00011011,$CDGH1,$CDGH1
# Per-block loop: load and byte-swap 64 bytes per lane, advance pointers.
517 movdqu 0x00(@ptr[0]),@MSG0[0]
518 movdqu 0x00(@ptr[1]),@MSG1[0]
519 movdqu 0x10(@ptr[0]),@MSG0[1]
520 movdqu 0x10(@ptr[1]),@MSG1[1]
521 movdqu 0x20(@ptr[0]),@MSG0[2]
522 pshufb $TMPx,@MSG0[0]
523 movdqu 0x20(@ptr[1]),@MSG1[2]
524 pshufb $TMPx,@MSG1[0]
525 movdqu 0x30(@ptr[0]),@MSG0[3]
526 lea 0x40(@ptr[0]),@ptr[0]
527 movdqu 0x30(@ptr[1]),@MSG1[3]
528 lea 0x40(@ptr[1]),@ptr[1]
# Rounds 0-15; the two lanes' rounds are interleaved instruction by
# instruction. Original state is offloaded to the stack for the final add.
530 movdqa 0*16-0x80($Tbl),$Wi
531 pshufb $TMPx,@MSG0[1]
533 pxor $ABEF0,@MSG0[0] # black magic
535 movdqa 0*16-0x80($Tbl),$TMP1
536 pshufb $TMPx,@MSG1[1]
538 movdqa $CDGH0,0x50(%rsp) # offload
539 sha256rnds2 $ABEF0,$CDGH0 # 0-3
540 pxor $ABEF1,@MSG1[0] # black magic
542 movdqa $CDGH1,0x70(%rsp)
543 sha256rnds2 $ABEF1,$CDGH1 # 0-3
544 pshufd \$0x0e,$TMP0,$Wi
545 pxor $ABEF0,@MSG0[0] # black magic
546 movdqa $ABEF0,0x40(%rsp) # offload
547 sha256rnds2 $CDGH0,$ABEF0
548 pshufd \$0x0e,$TMP1,$Wi
549 pxor $ABEF1,@MSG1[0] # black magic
550 movdqa $ABEF1,0x60(%rsp)
551 movdqa 1*16-0x80($Tbl),$TMP0
553 pshufb $TMPx,@MSG0[2]
554 sha256rnds2 $CDGH1,$ABEF1
557 movdqa 1*16-0x80($Tbl),$TMP1
559 sha256rnds2 $ABEF0,$CDGH0 # 4-7
561 prefetcht0 127(@ptr[0])
562 pshufb $TMPx,@MSG0[3]
563 pshufb $TMPx,@MSG1[2]
564 prefetcht0 127(@ptr[1])
565 sha256rnds2 $ABEF1,$CDGH1 # 4-7
566 pshufd \$0x0e,$TMP0,$Wi
567 pshufb $TMPx,@MSG1[3]
568 sha256msg1 @MSG0[1],@MSG0[0]
569 sha256rnds2 $CDGH0,$ABEF0
570 pshufd \$0x0e,$TMP1,$Wi
571 movdqa 2*16-0x80($Tbl),$TMP0
573 sha256rnds2 $CDGH1,$ABEF1
576 movdqa 2*16-0x80($Tbl),$TMP1
578 sha256rnds2 $ABEF0,$CDGH0 # 8-11
579 sha256msg1 @MSG1[1],@MSG1[0]
581 movdqa @MSG0[3],$TMPx
582 sha256rnds2 $ABEF1,$CDGH1 # 8-11
583 pshufd \$0x0e,$TMP0,$Wi
584 palignr \$4,@MSG0[2],$TMPx
586 movdqa @MSG1[3],$TMPx
587 palignr \$4,@MSG1[2],$TMPx
588 sha256msg1 @MSG0[2],@MSG0[1]
589 sha256rnds2 $CDGH0,$ABEF0
590 pshufd \$0x0e,$TMP1,$Wi
591 movdqa 3*16-0x80($Tbl),$TMP0
593 sha256rnds2 $CDGH1,$ABEF1
594 sha256msg1 @MSG1[2],@MSG1[1]
597 movdqa 3*16-0x80($Tbl),$TMP1
600 sha256msg2 @MSG0[3],@MSG0[0]
601 sha256rnds2 $ABEF0,$CDGH0 # 12-15
603 movdqa @MSG0[0],$TMPx
604 palignr \$4,@MSG0[3],$TMPx
605 sha256rnds2 $ABEF1,$CDGH1 # 12-15
606 sha256msg2 @MSG1[3],@MSG1[0]
607 pshufd \$0x0e,$TMP0,$Wi
609 movdqa @MSG1[0],$TMPx
610 palignr \$4,@MSG1[3],$TMPx
611 sha256msg1 @MSG0[3],@MSG0[2]
612 sha256rnds2 $CDGH0,$ABEF0
613 pshufd \$0x0e,$TMP1,$Wi
614 movdqa 4*16-0x80($Tbl),$TMP0
616 sha256rnds2 $CDGH1,$ABEF1
617 sha256msg1 @MSG1[3],@MSG1[2]
# Middle rounds 16..51 are generated by this loop, rotating the message
# register queues after each 4-round group.
619 for($i=4;$i<16-3;$i++) {
622 movdqa $i*16-0x80($Tbl),$TMP1
625 sha256msg2 @MSG0[0],@MSG0[1]
626 sha256rnds2 $ABEF0,$CDGH0 # 16-19...
628 movdqa @MSG0[1],$TMPx
629 palignr \$4,@MSG0[0],$TMPx
630 sha256rnds2 $ABEF1,$CDGH1 # 16-19...
631 sha256msg2 @MSG1[0],@MSG1[1]
632 pshufd \$0x0e,$TMP0,$Wi
634 movdqa @MSG1[1],$TMPx
635 palignr \$4,@MSG1[0],$TMPx
636 sha256msg1 @MSG0[0],@MSG0[3]
637 sha256rnds2 $CDGH0,$ABEF0
638 pshufd \$0x0e,$TMP1,$Wi
639 movdqa `($i+1)*16`-0x80($Tbl),$TMP0
641 sha256rnds2 $CDGH1,$ABEF1
642 sha256msg1 @MSG1[0],@MSG1[3]
644 push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1));
# Tail rounds 52..63, interleaved with counter bookkeeping.
648 movdqa 13*16-0x80($Tbl),$TMP1
651 sha256msg2 @MSG0[0],@MSG0[1]
652 sha256rnds2 $ABEF0,$CDGH0 # 52-55
654 movdqa @MSG0[1],$TMPx
655 palignr \$4,@MSG0[0],$TMPx
656 sha256rnds2 $ABEF1,$CDGH1 # 52-55
657 sha256msg2 @MSG1[0],@MSG1[1]
658 pshufd \$0x0e,$TMP0,$Wi
660 movdqa @MSG1[1],$TMPx
661 palignr \$4,@MSG1[0],$TMPx
663 sha256rnds2 $CDGH0,$ABEF0
664 pshufd \$0x0e,$TMP1,$Wi
665 movdqa 14*16-0x80($Tbl),$TMP0
667 sha256rnds2 $CDGH1,$ABEF1
670 movdqa 14*16-0x80($Tbl),$TMP1
673 sha256msg2 @MSG0[1],@MSG0[2]
675 sha256rnds2 $ABEF0,$CDGH0 # 56-59
678 pxor @MSG0[1],@MSG0[1] # zero
679 sha256rnds2 $ABEF1,$CDGH1 # 56-59
680 sha256msg2 @MSG1[1],@MSG1[2]
681 pshufd \$0x0e,$TMP0,$Wi
682 movdqa 15*16-0x80($Tbl),$TMP0
684 movq (%rbx),@MSG0[2] # pull counters
686 sha256rnds2 $CDGH0,$ABEF0
687 pshufd \$0x0e,$TMP1,$Wi
688 movdqa 15*16-0x80($Tbl),$TMP1
690 sha256rnds2 $CDGH1,$ABEF1
# Cancel exhausted lanes and build the per-lane decrement mask.
693 cmp 4*0(%rbx),%ecx # examine counters
694 cmovge %rsp,@ptr[0] # cancel input
697 pshufd \$0x00,@MSG0[2],@MSG1[0]
698 sha256rnds2 $ABEF0,$CDGH0 # 60-63
700 pshufd \$0x55,@MSG0[2],@MSG1[1]
701 movdqa @MSG0[2],@MSG1[2]
702 sha256rnds2 $ABEF1,$CDGH1 # 60-63
703 pshufd \$0x0e,$TMP0,$Wi
704 pcmpgtd @MSG0[1],@MSG1[0]
705 pcmpgtd @MSG0[1],@MSG1[1]
706 sha256rnds2 $CDGH0,$ABEF0
707 pshufd \$0x0e,$TMP1,$Wi
708 pcmpgtd @MSG0[1],@MSG1[2] # counter mask
709 movdqa K256_shaext-0x10(%rip),$TMPx
710 sha256rnds2 $CDGH1,$ABEF1
716 paddd @MSG0[2],@MSG1[2] # counters--
# Add the offloaded previous state (SHA-256 chaining) and save counters.
718 paddd 0x50(%rsp),$CDGH0
719 paddd 0x70(%rsp),$CDGH1
720 paddd 0x40(%rsp),$ABEF0
721 paddd 0x60(%rsp),$ABEF1
723 movq @MSG1[2],(%rbx) # save counters
727 mov `$REG_SZ*17+8`(%rsp),$num
# Unpack ABEF/CDGH back into the lane-interleaved context layout.
729 pshufd \$0b00011011,$ABEF0,$ABEF0
730 pshufd \$0b00011011,$CDGH0,$CDGH0
731 pshufd \$0b00011011,$ABEF1,$ABEF1
732 pshufd \$0b00011011,$CDGH1,$CDGH1
734 movdqa $ABEF0,@MSG0[0]
735 movdqa $CDGH0,@MSG0[1]
736 punpckldq $ABEF1,$ABEF0 # B1.B0.A1.A0
737 punpckhdq $ABEF1,@MSG0[0] # F1.F0.E1.E0
738 punpckldq $CDGH1,$CDGH0 # D1.D0.C1.C0
739 punpckhdq $CDGH1,@MSG0[1] # H1.H0.G1.G0
741 movq $ABEF0,0x00-0x80($ctx) # A1.A0
743 movq @MSG0[0],0x80-0x80($ctx) # E1.E0
745 movq $ABEF0,0x20-0x80($ctx) # B1.B0
746 movq @MSG0[0],0xa0-0x80($ctx) # F1.F0
748 movq $CDGH0,0x40-0x80($ctx) # C1.C0
750 movq @MSG0[1],0xc0-0x80($ctx) # G1.G0
752 movq $CDGH0,0x60-0x80($ctx) # D1.D0
753 movq @MSG0[1],0xe0-0x80($ctx) # H1.H0
755 lea `$REG_SZ/2`($ctx),$ctx
756 lea `16*2`($inp),$inp
758 jnz .Loop_grande_shaext
# Epilogue: restore Win64 xmm registers and the original stack pointer.
761 #mov `$REG_SZ*17`(%rsp),%rax # original %rsp
763 $code.=<<___ if ($win64);
764 movaps -0xb8(%rax),%xmm6
765 movaps -0xa8(%rax),%xmm7
766 movaps -0x98(%rax),%xmm8
767 movaps -0x88(%rax),%xmm9
768 movaps -0x78(%rax),%xmm10
769 movaps -0x68(%rax),%xmm11
770 movaps -0x58(%rax),%xmm12
771 movaps -0x48(%rax),%xmm13
772 movaps -0x38(%rax),%xmm14
773 movaps -0x28(%rax),%xmm15
781 .cfi_def_cfa_register %rsp
785 .size sha256_multi_block_shaext,.-sha256_multi_block_shaext
# ROUND_00_15_avx: AVX/AVX2 variant of the round body. Gathers use
# vmovd + vpinsrd + vpunpckldq (and vinserti128 for the 8-lane $REG_SZ==32
# case) instead of four separate movd loads; closing brace of this sub is
# outside the sampled view.
789 sub ROUND_00_15_avx {
790 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
# 4-lane (xmm) gather for rounds 0..14.
792 $code.=<<___ if ($i<15 && $REG_SZ==16);
793 vmovd `4*$i`(@ptr[0]),$Xi
794 vmovd `4*$i`(@ptr[1]),$t1
795 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
796 vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
797 vpunpckldq $t1,$Xi,$Xi
# Round 15 additionally advances all lane pointers past the block.
800 $code.=<<___ if ($i==15 && $REG_SZ==16);
801 vmovd `4*$i`(@ptr[0]),$Xi
802 lea `16*4`(@ptr[0]),@ptr[0]
803 vmovd `4*$i`(@ptr[1]),$t1
804 lea `16*4`(@ptr[1]),@ptr[1]
805 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
806 lea `16*4`(@ptr[2]),@ptr[2]
807 vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
808 lea `16*4`(@ptr[3]),@ptr[3]
809 vpunpckldq $t1,$Xi,$Xi
# 8-lane (ymm) gather for rounds 0..14: two xmm halves combined with
# vinserti128 (rewritten by the output filter to supply the \$1 selector).
812 $code.=<<___ if ($i<15 && $REG_SZ==32);
813 vmovd `4*$i`(@ptr[0]),$Xi
814 vmovd `4*$i`(@ptr[4]),$t1
815 vmovd `4*$i`(@ptr[1]),$t2
816 vmovd `4*$i`(@ptr[5]),$t3
817 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
818 vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
819 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
820 vpunpckldq $t2,$Xi,$Xi
821 vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
822 vpunpckldq $t3,$t1,$t1
823 vinserti128 $t1,$Xi,$Xi
826 $code.=<<___ if ($i==15 && $REG_SZ==32);
827 vmovd `4*$i`(@ptr[0]),$Xi
828 lea `16*4`(@ptr[0]),@ptr[0]
829 vmovd `4*$i`(@ptr[4]),$t1
830 lea `16*4`(@ptr[4]),@ptr[4]
831 vmovd `4*$i`(@ptr[1]),$t2
832 lea `16*4`(@ptr[1]),@ptr[1]
833 vmovd `4*$i`(@ptr[5]),$t3
834 lea `16*4`(@ptr[5]),@ptr[5]
835 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
836 lea `16*4`(@ptr[2]),@ptr[2]
837 vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
838 lea `16*4`(@ptr[6]),@ptr[6]
839 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
840 lea `16*4`(@ptr[3]),@ptr[3]
841 vpunpckldq $t2,$Xi,$Xi
842 vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
843 lea `16*4`(@ptr[7]),@ptr[7]
844 vpunpckldq $t3,$t1,$t1
845 vinserti128 $t1,$Xi,$Xi
# Common round body: stash the word, add h and K[round], compute
# Sigma1/Ch/Sigma0/Maj with three-operand AVX forms (see line comments).
851 vmovdqu $Xi,`&Xi_off($i)`
852 vpaddd $h,$Xi,$Xi # Xi+=h
855 vpxor $t3,$sigma,$sigma
857 vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi # Xi+=K[round]
858 vpxor $t2,$sigma,$sigma
861 vpxor $t3,$sigma,$sigma
862 `"prefetcht0 63(@ptr[0])" if ($i==15)`
865 vpand $f,$e,$axb # borrow $axb
866 `"prefetcht0 63(@ptr[1])" if ($i==15)`
867 vpxor $t2,$sigma,$sigma
869 vpsrld \$2,$a,$h # borrow $h
870 vpxor $t3,$sigma,$sigma # Sigma1(e)
871 `"prefetcht0 63(@ptr[2])" if ($i==15)`
873 vpxor $axb,$t1,$t1 # Ch(e,f,g)
874 vpxor $a,$b,$axb # a^b, b^c in next round
875 `"prefetcht0 63(@ptr[3])" if ($i==15)`
877 vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e)
880 `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
882 vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g)
884 `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
888 vpxor $t3,$sigma,$sigma
889 `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
891 vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
892 vpaddd $Xi,$d,$d # d+=Xi
893 `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
894 vpxor $t2,$sigma,$sigma
895 vpxor $t3,$sigma,$sigma # Sigma0(a)
897 vpaddd $Xi,$h,$h # h+=Xi
898 vpaddd $sigma,$h,$h # h+=Sigma0(a)
900 $code.=<<___ if (($i%8)==7);
# Rotate the borrowed a^b / b^c registers for the next round.
903 ($axb,$bxc)=($bxc,$axb);
# ROUND_16_XX_avx: AVX message-schedule expansion, then tail-calls the
# shared AVX round body. Computes Xi += X[i+9] + sigma0(X[i+1]) +
# sigma1(X[i+14]) per the inline comments (some shift/rotate lines are
# elided in this sampled view).
906 sub ROUND_16_XX_avx {
910 vmovdqu `&Xi_off($i+1)`,$Xn
911 vpaddd `&Xi_off($i+9)`,$Xi,$Xi # Xi+=X[i+9]
913 vpsrld \$3,$Xn,$sigma
916 vpxor $t2,$sigma,$sigma
918 vpxor $t3,$sigma,$sigma
920 vmovdqu `&Xi_off($i+14)`,$t1
921 vpsrld \$10,$t1,$axb # borrow $axb
923 vpxor $t2,$sigma,$sigma
925 vpxor $t3,$sigma,$sigma # sigma0(X[i+1])
927 vpaddd $sigma,$Xi,$Xi # Xi+=sigma0(e)
928 vpxor $t2,$axb,$sigma
930 vpxor $t3,$sigma,$sigma
932 vpxor $t2,$sigma,$sigma
933 vpxor $t3,$sigma,$sigma # sigma0(X[i+14])
934 vpaddd $sigma,$Xi,$Xi # Xi+=sigma1(X[i+14])
# Fall through into the common AVX round body.
936 &ROUND_00_15_avx($i,@_);
# --- sha256_multi_block_avx: 4-lane AVX entry point ---
# Mirrors the SSE path but uses the three-operand AVX round helpers; an
# AVX2 dispatch stub is emitted first when $avx>1 (lines elided).
941 .type sha256_multi_block_avx,\@function,3
943 sha256_multi_block_avx:
947 $code.=<<___ if ($avx>1);
959 .cfi_def_cfa_register %rax
# Win64 ABI: preserve callee-saved xmm6..15.
965 $code.=<<___ if ($win64);
968 movaps %xmm7,0x10(%rsp)
969 movaps %xmm8,0x20(%rsp)
970 movaps %xmm9,0x30(%rsp)
971 movaps %xmm10,-0x78(%rax)
972 movaps %xmm11,-0x68(%rax)
973 movaps %xmm12,-0x58(%rax)
974 movaps %xmm13,-0x48(%rax)
975 movaps %xmm14,-0x38(%rax)
976 movaps %xmm15,-0x28(%rax)
# Same frame layout as the SSE path: original %rsp saved at REG_SZ*17.
979 sub \$`$REG_SZ*18`, %rsp
981 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
982 .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
984 lea K256+128(%rip),$Tbl
985 lea `$REG_SZ*16`(%rsp),%rbx
986 lea 0x80($ctx),$ctx # size optimization
989 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
# Load 4 per-lane pointers/counters; exhausted lanes point at K256.
992 for($i=0;$i<4;$i++) {
994 mov `16*$i+0`($inp),@ptr[$i] # input pointer
995 mov `16*$i+8`($inp),%ecx # number of blocks
997 cmovg %ecx,$num # find maximum
999 mov %ecx,`4*$i`(%rbx) # initialize counters
1000 cmovle $Tbl,@ptr[$i] # cancel input
1007 vmovdqu 0x00-0x80($ctx),$A # load context
1009 vmovdqu 0x20-0x80($ctx),$B
1010 vmovdqu 0x40-0x80($ctx),$C
1011 vmovdqu 0x60-0x80($ctx),$D
1012 vmovdqu 0x80-0x80($ctx),$E
1013 vmovdqu 0xa0-0x80($ctx),$F
1014 vmovdqu 0xc0-0x80($ctx),$G
1015 vmovdqu 0xe0-0x80($ctx),$H
1016 vmovdqu .Lpbswap(%rip),$Xn
1021 vpxor $B,$C,$bxc # magic seed
# 16 gather+round iterations, then schedule-expansion rounds.
1023 for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
1025 vmovdqu `&Xi_off($i)`,$Xi
1031 for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
1037 lea K256+128(%rip),$Tbl
# Cancel finished lanes, decrement counters via the compare mask.
1039 for($i=0;$i<4;$i++) {
1041 cmp `4*$i`(%rbx),%ecx # examine counters
1042 cmovge $Tbl,@ptr[$i] # cancel input
1046 vmovdqa (%rbx),$sigma # pull counters
1049 vpcmpgtd $t1,$Xn,$Xn # mask value
1050 vpaddd $Xn,$sigma,$sigma # counters--
# Merge computed state with the saved context (blend lines elided here).
1052 vmovdqu 0x00-0x80($ctx),$t1
1054 vmovdqu 0x20-0x80($ctx),$t2
1056 vmovdqu 0x40-0x80($ctx),$t3
1058 vmovdqu 0x60-0x80($ctx),$Xi
1061 vmovdqu 0x80-0x80($ctx),$t1
1064 vmovdqu 0xa0-0x80($ctx),$t2
1067 vmovdqu 0xc0-0x80($ctx),$t3
1070 vmovdqu 0xe0-0x80($ctx),$Xi
1074 vmovdqu $A,0x00-0x80($ctx)
1076 vmovdqu $B,0x20-0x80($ctx)
1078 vmovdqu $C,0x40-0x80($ctx)
1079 vmovdqu $D,0x60-0x80($ctx)
1080 vmovdqu $E,0x80-0x80($ctx)
1081 vmovdqu $F,0xa0-0x80($ctx)
1082 vmovdqu $G,0xc0-0x80($ctx)
1083 vmovdqu $H,0xe0-0x80($ctx)
1085 vmovdqu $sigma,(%rbx) # save counters
1086 vmovdqu .Lpbswap(%rip),$Xn
# Advance to the next group of lanes and loop; then epilogue.
1090 mov `$REG_SZ*17+8`(%rsp),$num
1091 lea $REG_SZ($ctx),$ctx
1092 lea `16*$REG_SZ/4`($inp),$inp
1094 jnz .Loop_grande_avx
1097 mov `$REG_SZ*17`(%rsp),%rax # original %rsp
1101 $code.=<<___ if ($win64);
1102 movaps -0xb8(%rax),%xmm6
1103 movaps -0xa8(%rax),%xmm7
1104 movaps -0x98(%rax),%xmm8
1105 movaps -0x88(%rax),%xmm9
1106 movaps -0x78(%rax),%xmm10
1107 movaps -0x68(%rax),%xmm11
1108 movaps -0x58(%rax),%xmm12
1109 movaps -0x48(%rax),%xmm13
1110 movaps -0x38(%rax),%xmm14
1111 movaps -0x28(%rax),%xmm15
1119 .cfi_def_cfa_register %rsp
1123 .size sha256_multi_block_avx,.-sha256_multi_block_avx
# Expand any backticked Perl expressions accumulated so far before the
# AVX2 section redefines the register maps below.
1126 $code =~ s/\`([^\`]*)\`/eval $1/gem;
# --- sha256_multi_block_avx2: 8-lane AVX2 entry point ---
# Widens the lane registers to ymm and uses r12..r15 + r8..r11 as the
# eight per-lane input pointers.
1129 @ptr=map("%r$_",(12..15,8..11));
1131 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
1132 ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));
1135 .type sha256_multi_block_avx2,\@function,3
1137 sha256_multi_block_avx2:
1141 .cfi_def_cfa_register %rax
# Win64 ABI: preserve callee-saved xmm6..15 (split between the local
# 0xa8-byte area and the red-zone-style offsets from %rax).
1155 $code.=<<___ if ($win64);
1156 lea -0xa8(%rsp),%rsp
1158 movaps %xmm7,0x10(%rsp)
1159 movaps %xmm8,0x20(%rsp)
1160 movaps %xmm9,0x30(%rsp)
1161 movaps %xmm10,0x40(%rsp)
1162 movaps %xmm11,0x50(%rsp)
1163 movaps %xmm12,-0x78(%rax)
1164 movaps %xmm13,-0x68(%rax)
1165 movaps %xmm14,-0x58(%rax)
1166 movaps %xmm15,-0x48(%rax)
1169 sub \$`$REG_SZ*18`, %rsp
1171 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
1172 .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
1174 lea K256+128(%rip),$Tbl
1175 lea 0x80($ctx),$ctx # size optimization
1178 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
1180 lea `$REG_SZ*16`(%rsp),%rbx
# Load 8 per-lane pointers/counters; exhausted lanes point at K256.
1182 for($i=0;$i<8;$i++) {
1184 mov `16*$i+0`($inp),@ptr[$i] # input pointer
1185 mov `16*$i+8`($inp),%ecx # number of blocks
1187 cmovg %ecx,$num # find maximum
1189 mov %ecx,`4*$i`(%rbx) # initialize counters
1190 cmovle $Tbl,@ptr[$i] # cancel input
1194 vmovdqu 0x00-0x80($ctx),$A # load context
1196 vmovdqu 0x20-0x80($ctx),$B
# %rbx is repointed at the schedule area while in the round loop.
1197 lea 256+128(%rsp),%rbx
1198 vmovdqu 0x40-0x80($ctx),$C
1199 vmovdqu 0x60-0x80($ctx),$D
1200 vmovdqu 0x80-0x80($ctx),$E
1201 vmovdqu 0xa0-0x80($ctx),$F
1202 vmovdqu 0xc0-0x80($ctx),$G
1203 vmovdqu 0xe0-0x80($ctx),$H
1204 vmovdqu .Lpbswap(%rip),$Xn
1209 vpxor $B,$C,$bxc # magic seed
# Same round helpers as the AVX path, now emitting ymm code ($REG_SZ==32).
1211 for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
1213 vmovdqu `&Xi_off($i)`,$Xi
1215 jmp .Loop_16_xx_avx2
1219 for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
1222 jnz .Loop_16_xx_avx2
1225 lea `$REG_SZ*16`(%rsp),%rbx
1226 lea K256+128(%rip),$Tbl
1228 for($i=0;$i<8;$i++) {
1230 cmp `4*$i`(%rbx),%ecx # examine counters
1231 cmovge $Tbl,@ptr[$i] # cancel input
1235 vmovdqa (%rbx),$sigma # pull counters
1238 vpcmpgtd $t1,$Xn,$Xn # mask value
1239 vpaddd $Xn,$sigma,$sigma # counters--
1241 vmovdqu 0x00-0x80($ctx),$t1
1243 vmovdqu 0x20-0x80($ctx),$t2
1245 vmovdqu 0x40-0x80($ctx),$t3
1247 vmovdqu 0x60-0x80($ctx),$Xi
1250 vmovdqu 0x80-0x80($ctx),$t1
1253 vmovdqu 0xa0-0x80($ctx),$t2
1256 vmovdqu 0xc0-0x80($ctx),$t3
1259 vmovdqu 0xe0-0x80($ctx),$Xi
1263 vmovdqu $A,0x00-0x80($ctx)
1265 vmovdqu $B,0x20-0x80($ctx)
1267 vmovdqu $C,0x40-0x80($ctx)
1268 vmovdqu $D,0x60-0x80($ctx)
1269 vmovdqu $E,0x80-0x80($ctx)
1270 vmovdqu $F,0xa0-0x80($ctx)
1271 vmovdqu $G,0xc0-0x80($ctx)
1272 vmovdqu $H,0xe0-0x80($ctx)
1274 vmovdqu $sigma,(%rbx) # save counters
1275 lea 256+128(%rsp),%rbx
1276 vmovdqu .Lpbswap(%rip),$Xn
# NOTE(review): the grande-loop advance is commented out in this variant —
# AVX2 processes all 8 lanes in a single pass, so there is no second group.
1280 #mov `$REG_SZ*17+8`(%rsp),$num
1281 #lea $REG_SZ($ctx),$ctx
1282 #lea `16*$REG_SZ/4`($inp),$inp
1284 #jnz .Loop_grande_avx2
1287 mov `$REG_SZ*17`(%rsp),%rax # original %rsp
1291 $code.=<<___ if ($win64);
1292 movaps -0xd8(%rax),%xmm6
1293 movaps -0xc8(%rax),%xmm7
1294 movaps -0xb8(%rax),%xmm8
1295 movaps -0xa8(%rax),%xmm9
1296 movaps -0x98(%rax),%xmm10
1297 movaps -0x88(%rax),%xmm11
1298 movaps -0x78(%rax),%xmm12
1299 movaps -0x68(%rax),%xmm13
1300 movaps -0x58(%rax),%xmm14
1301 movaps -0x48(%rax),%xmm15
1317 .cfi_def_cfa_register %rsp
1321 .size sha256_multi_block_avx2,.-sha256_multi_block_avx2
# Constant data: the 64 SHA-256 round constants K[0..63] (replicated per
# lane by &TABLE for the SIMD paths), the .Lpbswap byte-swap mask, and a
# plain copy of K for the SHA-NI path (K256_shaext).
1336 &TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
1337 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
1338 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
1339 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
1340 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
1341 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
1342 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
1343 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
1344 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
1345 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
1346 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
1347 0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
1348 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
1349 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
1350 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
1351 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
1354 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
1355 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
1357 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1358 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1359 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1360 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1361 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1362 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1363 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1364 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1365 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1366 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1367 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1368 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1369 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1370 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1371 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1372 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1373 .asciz "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
# --- Win64 structured-exception handlers ---
# se_handler: if the fault lies between the prologue and epilogue labels
# (HandlerData[0]/[1]), recover the saved %rsp, restore the non-volatile
# registers and xmm6..15 into the CONTEXT, then chain to RtlVirtualUnwind.
1377 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1378 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1385 .extern __imp_RtlVirtualUnwind
1386 .type se_handler,\@abi-omnipotent
1400 mov 120($context),%rax # pull context->Rax
1401 mov 248($context),%rbx # pull context->Rip
1403 mov 8($disp),%rsi # disp->ImageBase
1404 mov 56($disp),%r11 # disp->HandlerData
1406 mov 0(%r11),%r10d # HandlerData[0]
1407 lea (%rsi,%r10),%r10 # end of prologue label
1408 cmp %r10,%rbx # context->Rip<.Lbody
1411 mov 152($context),%rax # pull context->Rsp
1413 mov 4(%r11),%r10d # HandlerData[1]
1414 lea (%rsi,%r10),%r10 # epilogue label
1415 cmp %r10,%rbx # context->Rip>=.Lepilogue
# Saved stack pointer lives at frame slot 16*17, matching the prologue.
1418 mov `16*17`(%rax),%rax # pull saved stack pointer
1422 mov %rbx,144($context) # restore context->Rbx
1423 mov %rbp,160($context) # restore context->Rbp
# Copy the ten saved xmm registers back into context.Xmm6..Xmm15.
1425 lea -24-10*16(%rax),%rsi
1426 lea 512($context),%rdi # &context.Xmm6
1428 .long 0xa548f3fc # cld; rep movsq
1433 mov %rax,152($context) # restore context->Rsp
1434 mov %rsi,168($context) # restore context->Rsi
1435 mov %rdi,176($context) # restore context->Rdi
# Common tail: copy ContextRecord and invoke RtlVirtualUnwind to continue
# the unwind, then report ExceptionContinueSearch.
1437 mov 40($disp),%rdi # disp->ContextRecord
1438 mov $context,%rsi # context
1439 mov \$154,%ecx # sizeof(CONTEXT)
1440 .long 0xa548f3fc # cld; rep movsq
1443 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1444 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1445 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1446 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1447 mov 40(%rsi),%r10 # disp->ContextRecord
1448 lea 56(%rsi),%r11 # &disp->HandlerData
1449 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1450 mov %r10,32(%rsp) # arg5
1451 mov %r11,40(%rsp) # arg6
1452 mov %r12,48(%rsp) # arg7
1453 mov %rcx,56(%rsp) # arg8, (NULL)
1454 call *__imp_RtlVirtualUnwind(%rip)
1456 mov \$1,%eax # ExceptionContinueSearch
1468 .size se_handler,.-se_handler
# avx2_handler: same scheme for the AVX2 frame (saved %rsp at slot 32*17,
# plus r12..r15 to restore).
1470 $code.=<<___ if ($avx>1);
1471 .type avx2_handler,\@abi-omnipotent
1485 mov 120($context),%rax # pull context->Rax
1486 mov 248($context),%rbx # pull context->Rip
1488 mov 8($disp),%rsi # disp->ImageBase
1489 mov 56($disp),%r11 # disp->HandlerData
1491 mov 0(%r11),%r10d # HandlerData[0]
1492 lea (%rsi,%r10),%r10 # end of prologue label
1493 cmp %r10,%rbx # context->Rip<body label
1496 mov 152($context),%rax # pull context->Rsp
1498 mov 4(%r11),%r10d # HandlerData[1]
1499 lea (%rsi,%r10),%r10 # epilogue label
1500 cmp %r10,%rbx # context->Rip>=epilogue label
1503 mov `32*17`($context),%rax # pull saved stack pointer
1511 mov %rbx,144($context) # restore context->Rbx
1512 mov %rbp,160($context) # restore context->Rbp
1513 mov %r12,216($context) # restore context->R12
1514 mov %r13,224($context) # restore context->R13
1515 mov %r14,232($context) # restore context->R14
1516 mov %r15,240($context) # restore context->R15
1518 lea -56-10*16(%rax),%rsi
1519 lea 512($context),%rdi # &context.Xmm6
1521 .long 0xa548f3fc # cld; rep movsq
1524 .size avx2_handler,.-avx2_handler
# Win64 SEH tables: per-function begin/end/info RVAs, with AVX/AVX2
# entries emitted conditionally on the detected $avx level, followed by
# the HandlerData (body/epilogue label pairs) each handler reads.
1529 .rva .LSEH_begin_sha256_multi_block
1530 .rva .LSEH_end_sha256_multi_block
1531 .rva .LSEH_info_sha256_multi_block
1532 .rva .LSEH_begin_sha256_multi_block_shaext
1533 .rva .LSEH_end_sha256_multi_block_shaext
1534 .rva .LSEH_info_sha256_multi_block_shaext
1536 $code.=<<___ if ($avx);
1537 .rva .LSEH_begin_sha256_multi_block_avx
1538 .rva .LSEH_end_sha256_multi_block_avx
1539 .rva .LSEH_info_sha256_multi_block_avx
1541 $code.=<<___ if ($avx>1);
1542 .rva .LSEH_begin_sha256_multi_block_avx2
1543 .rva .LSEH_end_sha256_multi_block_avx2
1544 .rva .LSEH_info_sha256_multi_block_avx2
1549 .LSEH_info_sha256_multi_block:
1552 .rva .Lbody,.Lepilogue # HandlerData[]
1553 .LSEH_info_sha256_multi_block_shaext:
1556 .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[]
1558 $code.=<<___ if ($avx);
1559 .LSEH_info_sha256_multi_block_avx:
1562 .rva .Lbody_avx,.Lepilogue_avx # HandlerData[]
1564 $code.=<<___ if ($avx>1);
1565 .LSEH_info_sha256_multi_block_avx2:
1568 .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[]
1571 ####################################################################
# rex fragment: builds a REX prefix byte for hand-encoded instructions —
# 0x04 (REX.R) for a high destination register, 0x01 (REX.B) for a high
# source, prepended only when needed.
1574 local *opcode=shift;
1578 $rex|=0x04 if ($dst>=8);
1579 $rex|=0x01 if ($src>=8);
1580 unshift @opcode,$rex|0x40 if ($rex);
# sha256op38 fragment: hand-encodes the SHA-NI instructions as .byte
# sequences (0F 38 <op> ModRM) so the output assembles on tools that do
# not know the mnemonics; otherwise the instruction is passed through.
1586 "sha256rnds2" => 0xcb,
1587 "sha256msg1" => 0xcc,
1588 "sha256msg2" => 0xcd );
1590 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1591 my @opcode=(0x0f,0x38);
1592 rex(\@opcode,$2,$1);
1593 push @opcode,$opcodelet{$instr};
1594 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
1595 return ".byte\t".join(',',@opcode);
1597 return $instr."\t".@_[0];
# Output filter: expand backticked expressions, encode SHA-NI mnemonics,
# and narrow ymm operands to xmm where the instruction forms require it
# (vmovd/vmovdqu/vpinsr/vpextr/vinserti128/vpbroadcast rewrites).
1601 foreach (split("\n",$code)) {
1602 s/\`([^\`]*)\`/eval($1)/ge;
1604 s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo or
1606 s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1607 s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
1608 s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or
1609 s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1610 s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or
1611 s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;