2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # Multi-buffer SHA256 procedure processes n buffers in parallel by
18 # placing buffer data to designated lane of SIMD register. n is
19 # naturally limited to 4 on pre-AVX2 processors and to 8 on
20 # AVX2-capable processors such as Haswell.
22 # this +aesni(i) sha256 aesni-sha256 gain(iv)
23 # -------------------------------------------------------------------
24 # Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126%
25 # Atom(ii) 38.7/n +3.93=13.6(n=4) 20.8 +5.69=26.5 +95%
26 # Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103%
27 # Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82%
28 # Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170%
29 # Skylake (18.9 +5.00=23.9)/n 7.70 8.17 +170%
30 # Bulldozer (21.6 +5.76=27.4)/n 13.6 13.7 +100%
32 # (i) multi-block CBC encrypt with 128-bit key;
33 # (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
34 # because of lower AES-NI instruction throughput, nor is there
35 # AES-NI-SHA256 stitch for these processors;
36 # (iii) "this" is for n=8, when we gather twice as much data, result
37 # for n=4 is 20.3+4.44=24.7;
38 # (iv) presented improvement coefficients are asymptotic limits and
39 # in real-life application are somewhat lower, e.g. for 2KB
40 # fragments they range from 75% to 130% (on Haswell);
# Disentangle command-line arguments: if the only argument contains a dot
# it is the output file name rather than a perlasm "flavour".
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 ABI (shadow space, SEH, xmm6-15 preservation) is selected either by
# flavour or by an .asm output extension.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64 perlasm translator relative to this script's own path,
# trying the side-by-side copy first, then the shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
55 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
56 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
57 $avx = ($1>=2.19) + ($1>=2.22);
60 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
61 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
62 $avx = ($1>=2.09) + ($1>=2.10);
65 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
66 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
67 $avx = ($1>=10) + ($1>=11);
70 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
71 $avx = ($2>=3.0) + ($2>3.0);
74 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
77 # void sha256_multi_block (
78 # struct { unsigned int A[8];
85 # unsigned int H[8]; } *ctx,
86 # struct { void *ptr; int blocks; } inp[8],
87 # int num); /* 1 or 2 */
# Register allocation for the SSE/AVX code paths (SysV argument registers).
$ctx="%rdi"; # 1st arg
$inp="%rsi"; # 2nd arg
$num="%edx"; # 3rd arg
# One general-purpose register per lane holds that buffer's input pointer.
@ptr=map("%r$_",(8..11));
# SHA-256 working state A..H lives in %xmm8..%xmm15; %xmm0..%xmm7 are
# scratch: temporaries, a^b / b^c carries, message words and sigma values.
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));
103 $off %= 16; $off *= $REG_SZ;
104 $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
108 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
110 $code.=<<___ if ($i<15);
111 movd `4*$i`(@ptr[0]),$Xi
112 movd `4*$i`(@ptr[1]),$t1
113 movd `4*$i`(@ptr[2]),$t2
114 movd `4*$i`(@ptr[3]),$t3
119 $code.=<<___ if ($i==15);
120 movd `4*$i`(@ptr[0]),$Xi
121 lea `16*4`(@ptr[0]),@ptr[0]
122 movd `4*$i`(@ptr[1]),$t1
123 lea `16*4`(@ptr[1]),@ptr[1]
124 movd `4*$i`(@ptr[2]),$t2
125 lea `16*4`(@ptr[2]),@ptr[2]
126 movd `4*$i`(@ptr[3]),$t3
127 lea `16*4`(@ptr[3]),@ptr[3]
134 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==0)`
136 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==1)`
140 movdqa $Xi,`&Xi_off($i)`
146 paddd `32*($i%8)-128`($Tbl),$Xi # Xi+=K[round]
151 `"prefetcht0 63(@ptr[0])" if ($i==15)`
153 movdqa $e,$axb # borrow $axb
159 `"prefetcht0 63(@ptr[1])" if ($i==15)`
161 pxor $t3,$sigma # Sigma1(e)
164 paddd $sigma,$Xi # Xi+=Sigma1(e)
165 pxor $axb,$t1 # Ch(e,f,g)
169 pxor $a,$axb # a^b, b^c in next round
171 `"prefetcht0 63(@ptr[2])" if ($i==15)`
174 paddd $t1,$Xi # Xi+=Ch(e,f,g)
179 `"prefetcht0 63(@ptr[3])" if ($i==15)`
185 pxor $bxc,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
187 pxor $t3,$sigma # Sigma0(a)
190 paddd $sigma,$h # h+=Sigma0(a)
192 $code.=<<___ if (($i%8)==7);
193 lea `32*8`($Tbl),$Tbl
195 ($axb,$bxc)=($bxc,$axb);
202 movdqa `&Xi_off($i+1)`,$Xn
203 paddd `&Xi_off($i+9)`,$Xi # Xi+=X[i+9]
211 movdqa `&Xi_off($i+14)`,$t1
215 movdqa $t1,$axb # borrow $axb
223 pxor $t3,$sigma # sigma0(X[i+1])
225 paddd $sigma,$Xi # Xi+=sigma0(e)
231 pxor $t2,$t1 # sigma0(X[i+14])
232 paddd $t1,$Xi # Xi+=sigma1(X[i+14])
241 .extern OPENSSL_ia32cap_P
243 .globl sha256_multi_block
244 .type sha256_multi_block,\@function,3
248 mov OPENSSL_ia32cap_P+4(%rip),%rcx
249 bt \$61,%rcx # check SHA bit
252 $code.=<<___ if ($avx);
258 .cfi_def_cfa_register %rax
264 $code.=<<___ if ($win64);
267 movaps %xmm7,0x10(%rsp)
268 movaps %xmm8,0x20(%rsp)
269 movaps %xmm9,0x30(%rsp)
270 movaps %xmm10,-0x78(%rax)
271 movaps %xmm11,-0x68(%rax)
272 movaps %xmm12,-0x58(%rax)
273 movaps %xmm13,-0x48(%rax)
274 movaps %xmm14,-0x38(%rax)
275 movaps %xmm15,-0x28(%rax)
278 sub \$`$REG_SZ*18`, %rsp
280 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
281 .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
283 lea K256+128(%rip),$Tbl
284 lea `$REG_SZ*16`(%rsp),%rbx
285 lea 0x80($ctx),$ctx # size optimization
288 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
291 for($i=0;$i<4;$i++) {
293 mov `16*$i+0`($inp),@ptr[$i] # input pointer
294 mov `16*$i+8`($inp),%ecx # number of blocks
296 cmovg %ecx,$num # find maximum
298 mov %ecx,`4*$i`(%rbx) # initialize counters
299 cmovle $Tbl,@ptr[$i] # cancel input
306 movdqu 0x00-0x80($ctx),$A # load context
308 movdqu 0x20-0x80($ctx),$B
309 movdqu 0x40-0x80($ctx),$C
310 movdqu 0x60-0x80($ctx),$D
311 movdqu 0x80-0x80($ctx),$E
312 movdqu 0xa0-0x80($ctx),$F
313 movdqu 0xc0-0x80($ctx),$G
314 movdqu 0xe0-0x80($ctx),$H
315 movdqu .Lpbswap(%rip),$Xn
321 pxor $B,$bxc # magic seed
323 for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
325 movdqu `&Xi_off($i)`,$Xi
331 for(;$i<32;$i++) { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
337 lea K256+128(%rip),$Tbl
339 movdqa (%rbx),$sigma # pull counters
340 cmp 4*0(%rbx),%ecx # examine counters
342 cmovge $Tbl,@ptr[0] # cancel input
347 pcmpgtd $t1,$Xn # mask value
350 paddd $Xn,$sigma # counters--
353 movdqu 0x00-0x80($ctx),$t1
355 movdqu 0x20-0x80($ctx),$t2
357 movdqu 0x40-0x80($ctx),$t3
359 movdqu 0x60-0x80($ctx),$Xi
362 movdqu 0x80-0x80($ctx),$t1
365 movdqu 0xa0-0x80($ctx),$t2
368 movdqu 0xc0-0x80($ctx),$t3
371 movdqu 0xe0-0x80($ctx),$Xi
375 movdqu $A,0x00-0x80($ctx)
377 movdqu $B,0x20-0x80($ctx)
379 movdqu $C,0x40-0x80($ctx)
380 movdqu $D,0x60-0x80($ctx)
381 movdqu $E,0x80-0x80($ctx)
382 movdqu $F,0xa0-0x80($ctx)
383 movdqu $G,0xc0-0x80($ctx)
384 movdqu $H,0xe0-0x80($ctx)
386 movdqa $sigma,(%rbx) # save counters
387 movdqa .Lpbswap(%rip),$Xn
391 mov `$REG_SZ*17+8`(%rsp),$num
392 lea $REG_SZ($ctx),$ctx
393 lea `16*$REG_SZ/4`($inp),$inp
398 mov `$REG_SZ*17`(%rsp),%rax # original %rsp
401 $code.=<<___ if ($win64);
402 movaps -0xb8(%rax),%xmm6
403 movaps -0xa8(%rax),%xmm7
404 movaps -0x98(%rax),%xmm8
405 movaps -0x88(%rax),%xmm9
406 movaps -0x78(%rax),%xmm10
407 movaps -0x68(%rax),%xmm11
408 movaps -0x58(%rax),%xmm12
409 movaps -0x48(%rax),%xmm13
410 movaps -0x38(%rax),%xmm14
411 movaps -0x28(%rax),%xmm15
419 .cfi_def_cfa_register %rsp
423 .size sha256_multi_block,.-sha256_multi_block
# SHAEXT-path register map: round constant ($Wi), temporaries and the two
# lanes' ABEF/CDGH state pairs in %xmm0..3 and %xmm12..15; the message
# schedule for lane 0 occupies %xmm4..7 and for lane 1 %xmm8..11.
my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(8..11));
431 .type sha256_multi_block_shaext,\@function,3
433 sha256_multi_block_shaext:
437 .cfi_def_cfa_register %rax
443 $code.=<<___ if ($win64);
446 movaps %xmm7,0x10(%rsp)
447 movaps %xmm8,0x20(%rsp)
448 movaps %xmm9,0x30(%rsp)
449 movaps %xmm10,-0x78(%rax)
450 movaps %xmm11,-0x68(%rax)
451 movaps %xmm12,-0x58(%rax)
452 movaps %xmm13,-0x48(%rax)
453 movaps %xmm14,-0x38(%rax)
454 movaps %xmm15,-0x28(%rax)
457 sub \$`$REG_SZ*18`,%rsp
458 shl \$1,$num # we process pair at a time
460 lea 0x80($ctx),$ctx # size optimization
461 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
463 lea `$REG_SZ*16`(%rsp),%rbx
464 lea K256_shaext+0x80(%rip),$Tbl
467 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
470 for($i=0;$i<2;$i++) {
472 mov `16*$i+0`($inp),@ptr[$i] # input pointer
473 mov `16*$i+8`($inp),%ecx # number of blocks
475 cmovg %ecx,$num # find maximum
477 mov %ecx,`4*$i`(%rbx) # initialize counters
478 cmovle %rsp,@ptr[$i] # cancel input
485 movq 0x00-0x80($ctx),$ABEF0 # A1.A0
486 movq 0x20-0x80($ctx),@MSG0[0] # B1.B0
487 movq 0x40-0x80($ctx),$CDGH0 # C1.C0
488 movq 0x60-0x80($ctx),@MSG0[1] # D1.D0
489 movq 0x80-0x80($ctx),@MSG1[0] # E1.E0
490 movq 0xa0-0x80($ctx),@MSG1[1] # F1.F0
491 movq 0xc0-0x80($ctx),@MSG1[2] # G1.G0
492 movq 0xe0-0x80($ctx),@MSG1[3] # H1.H0
494 punpckldq @MSG0[0],$ABEF0 # B1.A1.B0.A0
495 punpckldq @MSG0[1],$CDGH0 # D1.C1.D0.C0
496 punpckldq @MSG1[1],@MSG1[0] # F1.E1.F0.E0
497 punpckldq @MSG1[3],@MSG1[2] # H1.G1.H0.G0
498 movdqa K256_shaext-0x10(%rip),$TMPx # byte swap
502 punpcklqdq @MSG1[0],$ABEF0 # F0.E0.B0.A0
503 punpcklqdq @MSG1[2],$CDGH0 # H0.G0.D0.C0
504 punpckhqdq @MSG1[0],$ABEF1 # F1.E1.B1.A1
505 punpckhqdq @MSG1[2],$CDGH1 # H1.G1.D1.C1
507 pshufd \$0b00011011,$ABEF0,$ABEF0
508 pshufd \$0b00011011,$CDGH0,$CDGH0
509 pshufd \$0b00011011,$ABEF1,$ABEF1
510 pshufd \$0b00011011,$CDGH1,$CDGH1
515 movdqu 0x00(@ptr[0]),@MSG0[0]
516 movdqu 0x00(@ptr[1]),@MSG1[0]
517 movdqu 0x10(@ptr[0]),@MSG0[1]
518 movdqu 0x10(@ptr[1]),@MSG1[1]
519 movdqu 0x20(@ptr[0]),@MSG0[2]
520 pshufb $TMPx,@MSG0[0]
521 movdqu 0x20(@ptr[1]),@MSG1[2]
522 pshufb $TMPx,@MSG1[0]
523 movdqu 0x30(@ptr[0]),@MSG0[3]
524 lea 0x40(@ptr[0]),@ptr[0]
525 movdqu 0x30(@ptr[1]),@MSG1[3]
526 lea 0x40(@ptr[1]),@ptr[1]
528 movdqa 0*16-0x80($Tbl),$Wi
529 pshufb $TMPx,@MSG0[1]
531 pxor $ABEF0,@MSG0[0] # black magic
533 movdqa 0*16-0x80($Tbl),$TMP1
534 pshufb $TMPx,@MSG1[1]
536 movdqa $CDGH0,0x50(%rsp) # offload
537 sha256rnds2 $ABEF0,$CDGH0 # 0-3
538 pxor $ABEF1,@MSG1[0] # black magic
540 movdqa $CDGH1,0x70(%rsp)
541 sha256rnds2 $ABEF1,$CDGH1 # 0-3
542 pshufd \$0x0e,$TMP0,$Wi
543 pxor $ABEF0,@MSG0[0] # black magic
544 movdqa $ABEF0,0x40(%rsp) # offload
545 sha256rnds2 $CDGH0,$ABEF0
546 pshufd \$0x0e,$TMP1,$Wi
547 pxor $ABEF1,@MSG1[0] # black magic
548 movdqa $ABEF1,0x60(%rsp)
549 movdqa 1*16-0x80($Tbl),$TMP0
551 pshufb $TMPx,@MSG0[2]
552 sha256rnds2 $CDGH1,$ABEF1
555 movdqa 1*16-0x80($Tbl),$TMP1
557 sha256rnds2 $ABEF0,$CDGH0 # 4-7
559 prefetcht0 127(@ptr[0])
560 pshufb $TMPx,@MSG0[3]
561 pshufb $TMPx,@MSG1[2]
562 prefetcht0 127(@ptr[1])
563 sha256rnds2 $ABEF1,$CDGH1 # 4-7
564 pshufd \$0x0e,$TMP0,$Wi
565 pshufb $TMPx,@MSG1[3]
566 sha256msg1 @MSG0[1],@MSG0[0]
567 sha256rnds2 $CDGH0,$ABEF0
568 pshufd \$0x0e,$TMP1,$Wi
569 movdqa 2*16-0x80($Tbl),$TMP0
571 sha256rnds2 $CDGH1,$ABEF1
574 movdqa 2*16-0x80($Tbl),$TMP1
576 sha256rnds2 $ABEF0,$CDGH0 # 8-11
577 sha256msg1 @MSG1[1],@MSG1[0]
579 movdqa @MSG0[3],$TMPx
580 sha256rnds2 $ABEF1,$CDGH1 # 8-11
581 pshufd \$0x0e,$TMP0,$Wi
582 palignr \$4,@MSG0[2],$TMPx
584 movdqa @MSG1[3],$TMPx
585 palignr \$4,@MSG1[2],$TMPx
586 sha256msg1 @MSG0[2],@MSG0[1]
587 sha256rnds2 $CDGH0,$ABEF0
588 pshufd \$0x0e,$TMP1,$Wi
589 movdqa 3*16-0x80($Tbl),$TMP0
591 sha256rnds2 $CDGH1,$ABEF1
592 sha256msg1 @MSG1[2],@MSG1[1]
595 movdqa 3*16-0x80($Tbl),$TMP1
598 sha256msg2 @MSG0[3],@MSG0[0]
599 sha256rnds2 $ABEF0,$CDGH0 # 12-15
601 movdqa @MSG0[0],$TMPx
602 palignr \$4,@MSG0[3],$TMPx
603 sha256rnds2 $ABEF1,$CDGH1 # 12-15
604 sha256msg2 @MSG1[3],@MSG1[0]
605 pshufd \$0x0e,$TMP0,$Wi
607 movdqa @MSG1[0],$TMPx
608 palignr \$4,@MSG1[3],$TMPx
609 sha256msg1 @MSG0[3],@MSG0[2]
610 sha256rnds2 $CDGH0,$ABEF0
611 pshufd \$0x0e,$TMP1,$Wi
612 movdqa 4*16-0x80($Tbl),$TMP0
614 sha256rnds2 $CDGH1,$ABEF1
615 sha256msg1 @MSG1[3],@MSG1[2]
617 for($i=4;$i<16-3;$i++) {
620 movdqa $i*16-0x80($Tbl),$TMP1
623 sha256msg2 @MSG0[0],@MSG0[1]
624 sha256rnds2 $ABEF0,$CDGH0 # 16-19...
626 movdqa @MSG0[1],$TMPx
627 palignr \$4,@MSG0[0],$TMPx
628 sha256rnds2 $ABEF1,$CDGH1 # 16-19...
629 sha256msg2 @MSG1[0],@MSG1[1]
630 pshufd \$0x0e,$TMP0,$Wi
632 movdqa @MSG1[1],$TMPx
633 palignr \$4,@MSG1[0],$TMPx
634 sha256msg1 @MSG0[0],@MSG0[3]
635 sha256rnds2 $CDGH0,$ABEF0
636 pshufd \$0x0e,$TMP1,$Wi
637 movdqa `($i+1)*16`-0x80($Tbl),$TMP0
639 sha256rnds2 $CDGH1,$ABEF1
640 sha256msg1 @MSG1[0],@MSG1[3]
642 push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1));
646 movdqa 13*16-0x80($Tbl),$TMP1
649 sha256msg2 @MSG0[0],@MSG0[1]
650 sha256rnds2 $ABEF0,$CDGH0 # 52-55
652 movdqa @MSG0[1],$TMPx
653 palignr \$4,@MSG0[0],$TMPx
654 sha256rnds2 $ABEF1,$CDGH1 # 52-55
655 sha256msg2 @MSG1[0],@MSG1[1]
656 pshufd \$0x0e,$TMP0,$Wi
658 movdqa @MSG1[1],$TMPx
659 palignr \$4,@MSG1[0],$TMPx
661 sha256rnds2 $CDGH0,$ABEF0
662 pshufd \$0x0e,$TMP1,$Wi
663 movdqa 14*16-0x80($Tbl),$TMP0
665 sha256rnds2 $CDGH1,$ABEF1
668 movdqa 14*16-0x80($Tbl),$TMP1
671 sha256msg2 @MSG0[1],@MSG0[2]
673 sha256rnds2 $ABEF0,$CDGH0 # 56-59
676 pxor @MSG0[1],@MSG0[1] # zero
677 sha256rnds2 $ABEF1,$CDGH1 # 56-59
678 sha256msg2 @MSG1[1],@MSG1[2]
679 pshufd \$0x0e,$TMP0,$Wi
680 movdqa 15*16-0x80($Tbl),$TMP0
682 movq (%rbx),@MSG0[2] # pull counters
684 sha256rnds2 $CDGH0,$ABEF0
685 pshufd \$0x0e,$TMP1,$Wi
686 movdqa 15*16-0x80($Tbl),$TMP1
688 sha256rnds2 $CDGH1,$ABEF1
691 cmp 4*0(%rbx),%ecx # examine counters
692 cmovge %rsp,@ptr[0] # cancel input
695 pshufd \$0x00,@MSG0[2],@MSG1[0]
696 sha256rnds2 $ABEF0,$CDGH0 # 60-63
698 pshufd \$0x55,@MSG0[2],@MSG1[1]
699 movdqa @MSG0[2],@MSG1[2]
700 sha256rnds2 $ABEF1,$CDGH1 # 60-63
701 pshufd \$0x0e,$TMP0,$Wi
702 pcmpgtd @MSG0[1],@MSG1[0]
703 pcmpgtd @MSG0[1],@MSG1[1]
704 sha256rnds2 $CDGH0,$ABEF0
705 pshufd \$0x0e,$TMP1,$Wi
706 pcmpgtd @MSG0[1],@MSG1[2] # counter mask
707 movdqa K256_shaext-0x10(%rip),$TMPx
708 sha256rnds2 $CDGH1,$ABEF1
714 paddd @MSG0[2],@MSG1[2] # counters--
716 paddd 0x50(%rsp),$CDGH0
717 paddd 0x70(%rsp),$CDGH1
718 paddd 0x40(%rsp),$ABEF0
719 paddd 0x60(%rsp),$ABEF1
721 movq @MSG1[2],(%rbx) # save counters
725 mov `$REG_SZ*17+8`(%rsp),$num
727 pshufd \$0b00011011,$ABEF0,$ABEF0
728 pshufd \$0b00011011,$CDGH0,$CDGH0
729 pshufd \$0b00011011,$ABEF1,$ABEF1
730 pshufd \$0b00011011,$CDGH1,$CDGH1
732 movdqa $ABEF0,@MSG0[0]
733 movdqa $CDGH0,@MSG0[1]
734 punpckldq $ABEF1,$ABEF0 # B1.B0.A1.A0
735 punpckhdq $ABEF1,@MSG0[0] # F1.F0.E1.E0
736 punpckldq $CDGH1,$CDGH0 # D1.D0.C1.C0
737 punpckhdq $CDGH1,@MSG0[1] # H1.H0.G1.G0
739 movq $ABEF0,0x00-0x80($ctx) # A1.A0
741 movq @MSG0[0],0x80-0x80($ctx) # E1.E0
743 movq $ABEF0,0x20-0x80($ctx) # B1.B0
744 movq @MSG0[0],0xa0-0x80($ctx) # F1.F0
746 movq $CDGH0,0x40-0x80($ctx) # C1.C0
748 movq @MSG0[1],0xc0-0x80($ctx) # G1.G0
750 movq $CDGH0,0x60-0x80($ctx) # D1.D0
751 movq @MSG0[1],0xe0-0x80($ctx) # H1.H0
753 lea `$REG_SZ/2`($ctx),$ctx
754 lea `16*2`($inp),$inp
756 jnz .Loop_grande_shaext
759 #mov `$REG_SZ*17`(%rsp),%rax # original %rsp
761 $code.=<<___ if ($win64);
762 movaps -0xb8(%rax),%xmm6
763 movaps -0xa8(%rax),%xmm7
764 movaps -0x98(%rax),%xmm8
765 movaps -0x88(%rax),%xmm9
766 movaps -0x78(%rax),%xmm10
767 movaps -0x68(%rax),%xmm11
768 movaps -0x58(%rax),%xmm12
769 movaps -0x48(%rax),%xmm13
770 movaps -0x38(%rax),%xmm14
771 movaps -0x28(%rax),%xmm15
779 .cfi_def_cfa_register %rsp
783 .size sha256_multi_block_shaext,.-sha256_multi_block_shaext
787 sub ROUND_00_15_avx {
788 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
790 $code.=<<___ if ($i<15 && $REG_SZ==16);
791 vmovd `4*$i`(@ptr[0]),$Xi
792 vmovd `4*$i`(@ptr[1]),$t1
793 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
794 vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
795 vpunpckldq $t1,$Xi,$Xi
798 $code.=<<___ if ($i==15 && $REG_SZ==16);
799 vmovd `4*$i`(@ptr[0]),$Xi
800 lea `16*4`(@ptr[0]),@ptr[0]
801 vmovd `4*$i`(@ptr[1]),$t1
802 lea `16*4`(@ptr[1]),@ptr[1]
803 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
804 lea `16*4`(@ptr[2]),@ptr[2]
805 vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
806 lea `16*4`(@ptr[3]),@ptr[3]
807 vpunpckldq $t1,$Xi,$Xi
810 $code.=<<___ if ($i<15 && $REG_SZ==32);
811 vmovd `4*$i`(@ptr[0]),$Xi
812 vmovd `4*$i`(@ptr[4]),$t1
813 vmovd `4*$i`(@ptr[1]),$t2
814 vmovd `4*$i`(@ptr[5]),$t3
815 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
816 vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
817 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
818 vpunpckldq $t2,$Xi,$Xi
819 vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
820 vpunpckldq $t3,$t1,$t1
821 vinserti128 $t1,$Xi,$Xi
824 $code.=<<___ if ($i==15 && $REG_SZ==32);
825 vmovd `4*$i`(@ptr[0]),$Xi
826 lea `16*4`(@ptr[0]),@ptr[0]
827 vmovd `4*$i`(@ptr[4]),$t1
828 lea `16*4`(@ptr[4]),@ptr[4]
829 vmovd `4*$i`(@ptr[1]),$t2
830 lea `16*4`(@ptr[1]),@ptr[1]
831 vmovd `4*$i`(@ptr[5]),$t3
832 lea `16*4`(@ptr[5]),@ptr[5]
833 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
834 lea `16*4`(@ptr[2]),@ptr[2]
835 vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
836 lea `16*4`(@ptr[6]),@ptr[6]
837 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
838 lea `16*4`(@ptr[3]),@ptr[3]
839 vpunpckldq $t2,$Xi,$Xi
840 vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
841 lea `16*4`(@ptr[7]),@ptr[7]
842 vpunpckldq $t3,$t1,$t1
843 vinserti128 $t1,$Xi,$Xi
849 vmovdqu $Xi,`&Xi_off($i)`
850 vpaddd $h,$Xi,$Xi # Xi+=h
853 vpxor $t3,$sigma,$sigma
855 vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi # Xi+=K[round]
856 vpxor $t2,$sigma,$sigma
859 vpxor $t3,$sigma,$sigma
860 `"prefetcht0 63(@ptr[0])" if ($i==15)`
863 vpand $f,$e,$axb # borrow $axb
864 `"prefetcht0 63(@ptr[1])" if ($i==15)`
865 vpxor $t2,$sigma,$sigma
867 vpsrld \$2,$a,$h # borrow $h
868 vpxor $t3,$sigma,$sigma # Sigma1(e)
869 `"prefetcht0 63(@ptr[2])" if ($i==15)`
871 vpxor $axb,$t1,$t1 # Ch(e,f,g)
872 vpxor $a,$b,$axb # a^b, b^c in next round
873 `"prefetcht0 63(@ptr[3])" if ($i==15)`
875 vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e)
878 `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
880 vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g)
882 `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
886 vpxor $t3,$sigma,$sigma
887 `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
889 vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
890 vpaddd $Xi,$d,$d # d+=Xi
891 `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
892 vpxor $t2,$sigma,$sigma
893 vpxor $t3,$sigma,$sigma # Sigma0(a)
895 vpaddd $Xi,$h,$h # h+=Xi
896 vpaddd $sigma,$h,$h # h+=Sigma0(a)
898 $code.=<<___ if (($i%8)==7);
901 ($axb,$bxc)=($bxc,$axb);
904 sub ROUND_16_XX_avx {
908 vmovdqu `&Xi_off($i+1)`,$Xn
909 vpaddd `&Xi_off($i+9)`,$Xi,$Xi # Xi+=X[i+9]
911 vpsrld \$3,$Xn,$sigma
914 vpxor $t2,$sigma,$sigma
916 vpxor $t3,$sigma,$sigma
918 vmovdqu `&Xi_off($i+14)`,$t1
919 vpsrld \$10,$t1,$axb # borrow $axb
921 vpxor $t2,$sigma,$sigma
923 vpxor $t3,$sigma,$sigma # sigma0(X[i+1])
925 vpaddd $sigma,$Xi,$Xi # Xi+=sigma0(e)
926 vpxor $t2,$axb,$sigma
928 vpxor $t3,$sigma,$sigma
930 vpxor $t2,$sigma,$sigma
931 vpxor $t3,$sigma,$sigma # sigma0(X[i+14])
932 vpaddd $sigma,$Xi,$Xi # Xi+=sigma1(X[i+14])
934 &ROUND_00_15_avx($i,@_);
939 .type sha256_multi_block_avx,\@function,3
941 sha256_multi_block_avx:
945 $code.=<<___ if ($avx>1);
957 .cfi_def_cfa_register %rax
963 $code.=<<___ if ($win64);
966 movaps %xmm7,0x10(%rsp)
967 movaps %xmm8,0x20(%rsp)
968 movaps %xmm9,0x30(%rsp)
969 movaps %xmm10,-0x78(%rax)
970 movaps %xmm11,-0x68(%rax)
971 movaps %xmm12,-0x58(%rax)
972 movaps %xmm13,-0x48(%rax)
973 movaps %xmm14,-0x38(%rax)
974 movaps %xmm15,-0x28(%rax)
977 sub \$`$REG_SZ*18`, %rsp
979 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
980 .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
982 lea K256+128(%rip),$Tbl
983 lea `$REG_SZ*16`(%rsp),%rbx
984 lea 0x80($ctx),$ctx # size optimization
987 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
990 for($i=0;$i<4;$i++) {
992 mov `16*$i+0`($inp),@ptr[$i] # input pointer
993 mov `16*$i+8`($inp),%ecx # number of blocks
995 cmovg %ecx,$num # find maximum
997 mov %ecx,`4*$i`(%rbx) # initialize counters
998 cmovle $Tbl,@ptr[$i] # cancel input
1005 vmovdqu 0x00-0x80($ctx),$A # load context
1007 vmovdqu 0x20-0x80($ctx),$B
1008 vmovdqu 0x40-0x80($ctx),$C
1009 vmovdqu 0x60-0x80($ctx),$D
1010 vmovdqu 0x80-0x80($ctx),$E
1011 vmovdqu 0xa0-0x80($ctx),$F
1012 vmovdqu 0xc0-0x80($ctx),$G
1013 vmovdqu 0xe0-0x80($ctx),$H
1014 vmovdqu .Lpbswap(%rip),$Xn
1019 vpxor $B,$C,$bxc # magic seed
1021 for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
1023 vmovdqu `&Xi_off($i)`,$Xi
1029 for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
1035 lea K256+128(%rip),$Tbl
1037 for($i=0;$i<4;$i++) {
1039 cmp `4*$i`(%rbx),%ecx # examine counters
1040 cmovge $Tbl,@ptr[$i] # cancel input
1044 vmovdqa (%rbx),$sigma # pull counters
1047 vpcmpgtd $t1,$Xn,$Xn # mask value
1048 vpaddd $Xn,$sigma,$sigma # counters--
1050 vmovdqu 0x00-0x80($ctx),$t1
1052 vmovdqu 0x20-0x80($ctx),$t2
1054 vmovdqu 0x40-0x80($ctx),$t3
1056 vmovdqu 0x60-0x80($ctx),$Xi
1059 vmovdqu 0x80-0x80($ctx),$t1
1062 vmovdqu 0xa0-0x80($ctx),$t2
1065 vmovdqu 0xc0-0x80($ctx),$t3
1068 vmovdqu 0xe0-0x80($ctx),$Xi
1072 vmovdqu $A,0x00-0x80($ctx)
1074 vmovdqu $B,0x20-0x80($ctx)
1076 vmovdqu $C,0x40-0x80($ctx)
1077 vmovdqu $D,0x60-0x80($ctx)
1078 vmovdqu $E,0x80-0x80($ctx)
1079 vmovdqu $F,0xa0-0x80($ctx)
1080 vmovdqu $G,0xc0-0x80($ctx)
1081 vmovdqu $H,0xe0-0x80($ctx)
1083 vmovdqu $sigma,(%rbx) # save counters
1084 vmovdqu .Lpbswap(%rip),$Xn
1088 mov `$REG_SZ*17+8`(%rsp),$num
1089 lea $REG_SZ($ctx),$ctx
1090 lea `16*$REG_SZ/4`($inp),$inp
1092 jnz .Loop_grande_avx
1095 mov `$REG_SZ*17`(%rsp),%rax # original %rsp
1099 $code.=<<___ if ($win64);
1100 movaps -0xb8(%rax),%xmm6
1101 movaps -0xa8(%rax),%xmm7
1102 movaps -0x98(%rax),%xmm8
1103 movaps -0x88(%rax),%xmm9
1104 movaps -0x78(%rax),%xmm10
1105 movaps -0x68(%rax),%xmm11
1106 movaps -0x58(%rax),%xmm12
1107 movaps -0x48(%rax),%xmm13
1108 movaps -0x38(%rax),%xmm14
1109 movaps -0x28(%rax),%xmm15
1117 .cfi_def_cfa_register %rsp
1121 .size sha256_multi_block_avx,.-sha256_multi_block_avx
1124 $code =~ s/\`([^\`]*)\`/eval $1/gem;
# AVX2 path: eight lanes, so eight input-pointer registers
# (%r12..%r15 then %r8..%r11) ...
@ptr=map("%r$_",(12..15,8..11));
# ... and the state/scratch registers widen to 256-bit %ymm, same layout
# as the SSE path (A..H in %ymm8..15, scratch in %ymm0..7).
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));
1133 .type sha256_multi_block_avx2,\@function,3
1135 sha256_multi_block_avx2:
1139 .cfi_def_cfa_register %rax
1153 $code.=<<___ if ($win64);
1154 lea -0xa8(%rsp),%rsp
1156 movaps %xmm7,0x10(%rsp)
1157 movaps %xmm8,0x20(%rsp)
1158 movaps %xmm9,0x30(%rsp)
1159 movaps %xmm10,0x40(%rsp)
1160 movaps %xmm11,0x50(%rsp)
1161 movaps %xmm12,-0x78(%rax)
1162 movaps %xmm13,-0x68(%rax)
1163 movaps %xmm14,-0x58(%rax)
1164 movaps %xmm15,-0x48(%rax)
1167 sub \$`$REG_SZ*18`, %rsp
1169 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
1170 .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8
1172 lea K256+128(%rip),$Tbl
1173 lea 0x80($ctx),$ctx # size optimization
1176 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
1178 lea `$REG_SZ*16`(%rsp),%rbx
1180 for($i=0;$i<8;$i++) {
1182 mov `16*$i+0`($inp),@ptr[$i] # input pointer
1183 mov `16*$i+8`($inp),%ecx # number of blocks
1185 cmovg %ecx,$num # find maximum
1187 mov %ecx,`4*$i`(%rbx) # initialize counters
1188 cmovle $Tbl,@ptr[$i] # cancel input
1192 vmovdqu 0x00-0x80($ctx),$A # load context
1194 vmovdqu 0x20-0x80($ctx),$B
1195 lea 256+128(%rsp),%rbx
1196 vmovdqu 0x40-0x80($ctx),$C
1197 vmovdqu 0x60-0x80($ctx),$D
1198 vmovdqu 0x80-0x80($ctx),$E
1199 vmovdqu 0xa0-0x80($ctx),$F
1200 vmovdqu 0xc0-0x80($ctx),$G
1201 vmovdqu 0xe0-0x80($ctx),$H
1202 vmovdqu .Lpbswap(%rip),$Xn
1207 vpxor $B,$C,$bxc # magic seed
1209 for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
1211 vmovdqu `&Xi_off($i)`,$Xi
1213 jmp .Loop_16_xx_avx2
1217 for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
1220 jnz .Loop_16_xx_avx2
1223 lea `$REG_SZ*16`(%rsp),%rbx
1224 lea K256+128(%rip),$Tbl
1226 for($i=0;$i<8;$i++) {
1228 cmp `4*$i`(%rbx),%ecx # examine counters
1229 cmovge $Tbl,@ptr[$i] # cancel input
1233 vmovdqa (%rbx),$sigma # pull counters
1236 vpcmpgtd $t1,$Xn,$Xn # mask value
1237 vpaddd $Xn,$sigma,$sigma # counters--
1239 vmovdqu 0x00-0x80($ctx),$t1
1241 vmovdqu 0x20-0x80($ctx),$t2
1243 vmovdqu 0x40-0x80($ctx),$t3
1245 vmovdqu 0x60-0x80($ctx),$Xi
1248 vmovdqu 0x80-0x80($ctx),$t1
1251 vmovdqu 0xa0-0x80($ctx),$t2
1254 vmovdqu 0xc0-0x80($ctx),$t3
1257 vmovdqu 0xe0-0x80($ctx),$Xi
1261 vmovdqu $A,0x00-0x80($ctx)
1263 vmovdqu $B,0x20-0x80($ctx)
1265 vmovdqu $C,0x40-0x80($ctx)
1266 vmovdqu $D,0x60-0x80($ctx)
1267 vmovdqu $E,0x80-0x80($ctx)
1268 vmovdqu $F,0xa0-0x80($ctx)
1269 vmovdqu $G,0xc0-0x80($ctx)
1270 vmovdqu $H,0xe0-0x80($ctx)
1272 vmovdqu $sigma,(%rbx) # save counters
1273 lea 256+128(%rsp),%rbx
1274 vmovdqu .Lpbswap(%rip),$Xn
1278 #mov `$REG_SZ*17+8`(%rsp),$num
1279 #lea $REG_SZ($ctx),$ctx
1280 #lea `16*$REG_SZ/4`($inp),$inp
1282 #jnz .Loop_grande_avx2
1285 mov `$REG_SZ*17`(%rsp),%rax # original %rsp
1289 $code.=<<___ if ($win64);
1290 movaps -0xd8(%rax),%xmm6
1291 movaps -0xc8(%rax),%xmm7
1292 movaps -0xb8(%rax),%xmm8
1293 movaps -0xa8(%rax),%xmm9
1294 movaps -0x98(%rax),%xmm10
1295 movaps -0x88(%rax),%xmm11
1296 movaps -0x78(%rax),%xmm12
1297 movaps -0x68(%rax),%xmm13
1298 movaps -0x58(%rax),%xmm14
1299 movaps -0x48(%rax),%xmm15
1315 .cfi_def_cfa_register %rsp
1319 .size sha256_multi_block_avx2,.-sha256_multi_block_avx2
1334 &TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
1335 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
1336 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
1337 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
1338 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
1339 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
1340 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
1341 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
1342 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
1343 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
1344 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
1345 0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
1346 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
1347 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
1348 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
1349 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
1352 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
1353 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
1355 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1356 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1357 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1358 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1359 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1360 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1361 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1362 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1363 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1364 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1365 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1366 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1367 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1368 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1369 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1370 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1371 .asciz "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1375 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1376 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1383 .extern __imp_RtlVirtualUnwind
1384 .type se_handler,\@abi-omnipotent
1398 mov 120($context),%rax # pull context->Rax
1399 mov 248($context),%rbx # pull context->Rip
1401 mov 8($disp),%rsi # disp->ImageBase
1402 mov 56($disp),%r11 # disp->HandlerData
1404 mov 0(%r11),%r10d # HandlerData[0]
1405 lea (%rsi,%r10),%r10 # end of prologue label
1406 cmp %r10,%rbx # context->Rip<.Lbody
1409 mov 152($context),%rax # pull context->Rsp
1411 mov 4(%r11),%r10d # HandlerData[1]
1412 lea (%rsi,%r10),%r10 # epilogue label
1413 cmp %r10,%rbx # context->Rip>=.Lepilogue
1416 mov `16*17`(%rax),%rax # pull saved stack pointer
1420 mov %rbx,144($context) # restore context->Rbx
1421 mov %rbp,160($context) # restore context->Rbp
1423 lea -24-10*16(%rax),%rsi
1424 lea 512($context),%rdi # &context.Xmm6
1426 .long 0xa548f3fc # cld; rep movsq
1431 mov %rax,152($context) # restore context->Rsp
1432 mov %rsi,168($context) # restore context->Rsi
1433 mov %rdi,176($context) # restore context->Rdi
1435 mov 40($disp),%rdi # disp->ContextRecord
1436 mov $context,%rsi # context
1437 mov \$154,%ecx # sizeof(CONTEXT)
1438 .long 0xa548f3fc # cld; rep movsq
1441 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1442 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1443 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1444 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1445 mov 40(%rsi),%r10 # disp->ContextRecord
1446 lea 56(%rsi),%r11 # &disp->HandlerData
1447 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1448 mov %r10,32(%rsp) # arg5
1449 mov %r11,40(%rsp) # arg6
1450 mov %r12,48(%rsp) # arg7
1451 mov %rcx,56(%rsp) # arg8, (NULL)
1452 call *__imp_RtlVirtualUnwind(%rip)
1454 mov \$1,%eax # ExceptionContinueSearch
1466 .size se_handler,.-se_handler
1468 $code.=<<___ if ($avx>1);
1469 .type avx2_handler,\@abi-omnipotent
1483 mov 120($context),%rax # pull context->Rax
1484 mov 248($context),%rbx # pull context->Rip
1486 mov 8($disp),%rsi # disp->ImageBase
1487 mov 56($disp),%r11 # disp->HandlerData
1489 mov 0(%r11),%r10d # HandlerData[0]
1490 lea (%rsi,%r10),%r10 # end of prologue label
1491 cmp %r10,%rbx # context->Rip<body label
1494 mov 152($context),%rax # pull context->Rsp
1496 mov 4(%r11),%r10d # HandlerData[1]
1497 lea (%rsi,%r10),%r10 # epilogue label
1498 cmp %r10,%rbx # context->Rip>=epilogue label
1501 mov `32*17`($context),%rax # pull saved stack pointer
1509 mov %rbx,144($context) # restore context->Rbx
1510 mov %rbp,160($context) # restore context->Rbp
1511 mov %r12,216($context) # restore context->R12
1512 mov %r13,224($context) # restore context->R13
1513 mov %r14,232($context) # restore context->R14
1514 mov %r15,240($context) # restore context->R15
1516 lea -56-10*16(%rax),%rsi
1517 lea 512($context),%rdi # &context.Xmm6
1519 .long 0xa548f3fc # cld; rep movsq
1522 .size avx2_handler,.-avx2_handler
1527 .rva .LSEH_begin_sha256_multi_block
1528 .rva .LSEH_end_sha256_multi_block
1529 .rva .LSEH_info_sha256_multi_block
1530 .rva .LSEH_begin_sha256_multi_block_shaext
1531 .rva .LSEH_end_sha256_multi_block_shaext
1532 .rva .LSEH_info_sha256_multi_block_shaext
1534 $code.=<<___ if ($avx);
1535 .rva .LSEH_begin_sha256_multi_block_avx
1536 .rva .LSEH_end_sha256_multi_block_avx
1537 .rva .LSEH_info_sha256_multi_block_avx
1539 $code.=<<___ if ($avx>1);
1540 .rva .LSEH_begin_sha256_multi_block_avx2
1541 .rva .LSEH_end_sha256_multi_block_avx2
1542 .rva .LSEH_info_sha256_multi_block_avx2
1547 .LSEH_info_sha256_multi_block:
1550 .rva .Lbody,.Lepilogue # HandlerData[]
1551 .LSEH_info_sha256_multi_block_shaext:
1554 .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[]
1556 $code.=<<___ if ($avx);
1557 .LSEH_info_sha256_multi_block_avx:
1560 .rva .Lbody_avx,.Lepilogue_avx # HandlerData[]
1562 $code.=<<___ if ($avx>1);
1563 .LSEH_info_sha256_multi_block_avx2:
1566 .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[]
1569 ####################################################################
1572 local *opcode=shift;
1576 $rex|=0x04 if ($dst>=8);
1577 $rex|=0x01 if ($src>=8);
1578 unshift @opcode,$rex|0x40 if ($rex);
1584 "sha256rnds2" => 0xcb,
1585 "sha256msg1" => 0xcc,
1586 "sha256msg2" => 0xcd );
1588 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1589 my @opcode=(0x0f,0x38);
1590 rex(\@opcode,$2,$1);
1591 push @opcode,$opcodelet{$instr};
1592 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
1593 return ".byte\t".join(',',@opcode);
1595 return $instr."\t".@_[0];
1599 foreach (split("\n",$code)) {
1600 s/\`([^\`]*)\`/eval($1)/ge;
1602 s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo or
1604 s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1605 s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
1606 s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or
1607 s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1608 s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or
1609 s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;