2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # Multi-buffer SHA256 procedure processes n buffers in parallel by
18 # placing buffer data to designated lane of SIMD register. n is
19 # naturally limited to 4 on pre-AVX2 processors and to 8 on
20 # AVX2-capable processors such as Haswell.
22 # this +aesni(i) sha256 aesni-sha256 gain(iv)
23 # -------------------------------------------------------------------
24 # Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126%
25 # Atom(ii) 38.7/n +3.93=13.6(n=4) 20.8 +5.69=26.5 +95%
26 # Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103%
27 # Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82%
28 # Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170%
29 # Skylake (18.9 +5.00=23.9)/n 7.70 8.17 +170%
30 # Bulldozer (21.6 +5.76=27.4)/n 13.6 13.7 +100%
32 # (i) multi-block CBC encrypt with 128-bit key;
33 # (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
34 # because of lower AES-NI instruction throughput, nor is there
35 # AES-NI-SHA256 stitch for these processors;
36 # (iii) "this" is for n=8, when we gather twice as much data, result
37 # for n=4 is 20.3+4.44=24.7;
38 # (iv) presented improvement coefficients are asymptotic limits and
39 # in real-life application are somewhat lower, e.g. for 2KB
40 # fragments they range from 75% to 130% (on Haswell);
44 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
46 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
48 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
49 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
50 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
51 die "can't locate x86_64-xlate.pl";
55 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
56 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
57 $avx = ($1>=2.19) + ($1>=2.22);
60 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
61 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
62 $avx = ($1>=2.09) + ($1>=2.10);
65 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
66 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
67 $avx = ($1>=10) + ($1>=11);
70 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
71 $avx = ($2>=3.0) + ($2>3.0);
74 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
77 # void sha256_multi_block (
78 # struct { unsigned int A[8];
85 # unsigned int H[8]; } *ctx,
86 # struct { void *ptr; int blocks; } inp[8],
87 # int num); /* 1 or 2 */
89 $ctx="%rdi"; # 1st arg
90 $inp="%rsi"; # 2nd arg
91 $num="%edx"; # 3rd arg
92 @ptr=map("%r$_",(8..11));
95 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
96 ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));
103 $off %= 16; $off *= $REG_SZ;
104 $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
108 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
110 $code.=<<___ if ($i<15);
111 movd `4*$i`(@ptr[0]),$Xi
112 movd `4*$i`(@ptr[1]),$t1
113 movd `4*$i`(@ptr[2]),$t2
114 movd `4*$i`(@ptr[3]),$t3
119 $code.=<<___ if ($i==15);
120 movd `4*$i`(@ptr[0]),$Xi
121 lea `16*4`(@ptr[0]),@ptr[0]
122 movd `4*$i`(@ptr[1]),$t1
123 lea `16*4`(@ptr[1]),@ptr[1]
124 movd `4*$i`(@ptr[2]),$t2
125 lea `16*4`(@ptr[2]),@ptr[2]
126 movd `4*$i`(@ptr[3]),$t3
127 lea `16*4`(@ptr[3]),@ptr[3]
134 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==0)`
136 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==1)`
140 movdqa $Xi,`&Xi_off($i)`
146 paddd `32*($i%8)-128`($Tbl),$Xi # Xi+=K[round]
151 `"prefetcht0 63(@ptr[0])" if ($i==15)`
153 movdqa $e,$axb # borrow $axb
159 `"prefetcht0 63(@ptr[1])" if ($i==15)`
161 pxor $t3,$sigma # Sigma1(e)
164 paddd $sigma,$Xi # Xi+=Sigma1(e)
165 pxor $axb,$t1 # Ch(e,f,g)
169 pxor $a,$axb # a^b, b^c in next round
171 `"prefetcht0 63(@ptr[2])" if ($i==15)`
174 paddd $t1,$Xi # Xi+=Ch(e,f,g)
179 `"prefetcht0 63(@ptr[3])" if ($i==15)`
185 pxor $bxc,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
187 pxor $t3,$sigma # Sigma0(a)
190 paddd $sigma,$h # h+=Sigma0(a)
192 $code.=<<___ if (($i%8)==7);
193 lea `32*8`($Tbl),$Tbl
195 ($axb,$bxc)=($bxc,$axb);
202 movdqa `&Xi_off($i+1)`,$Xn
203 paddd `&Xi_off($i+9)`,$Xi # Xi+=X[i+9]
211 movdqa `&Xi_off($i+14)`,$t1
215 movdqa $t1,$axb # borrow $axb
223 pxor $t3,$sigma # sigma0(X[i+1])
225 paddd $sigma,$Xi # Xi+=sigma0(e)
231 pxor $t2,$t1 # sigma0(X[i+14])
232 paddd $t1,$Xi # Xi+=sigma1(X[i+14])
241 .extern OPENSSL_ia32cap_P
243 .globl sha256_multi_block
244 .type sha256_multi_block,\@function,3
247 mov OPENSSL_ia32cap_P+4(%rip),%rcx
248 bt \$61,%rcx # check SHA bit
251 $code.=<<___ if ($avx);
260 $code.=<<___ if ($win64);
263 movaps %xmm7,0x10(%rsp)
264 movaps %xmm8,0x20(%rsp)
265 movaps %xmm9,0x30(%rsp)
266 movaps %xmm10,-0x78(%rax)
267 movaps %xmm11,-0x68(%rax)
268 movaps %xmm12,-0x58(%rax)
269 movaps %xmm13,-0x48(%rax)
270 movaps %xmm14,-0x38(%rax)
271 movaps %xmm15,-0x28(%rax)
274 sub \$`$REG_SZ*18`, %rsp
276 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
278 lea K256+128(%rip),$Tbl
279 lea `$REG_SZ*16`(%rsp),%rbx
280 lea 0x80($ctx),$ctx # size optimization
283 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
286 for($i=0;$i<4;$i++) {
288 mov `16*$i+0`($inp),@ptr[$i] # input pointer
289 mov `16*$i+8`($inp),%ecx # number of blocks
291 cmovg %ecx,$num # find maximum
293 mov %ecx,`4*$i`(%rbx) # initialize counters
294 cmovle $Tbl,@ptr[$i] # cancel input
301 movdqu 0x00-0x80($ctx),$A # load context
303 movdqu 0x20-0x80($ctx),$B
304 movdqu 0x40-0x80($ctx),$C
305 movdqu 0x60-0x80($ctx),$D
306 movdqu 0x80-0x80($ctx),$E
307 movdqu 0xa0-0x80($ctx),$F
308 movdqu 0xc0-0x80($ctx),$G
309 movdqu 0xe0-0x80($ctx),$H
310 movdqu .Lpbswap(%rip),$Xn
316 pxor $B,$bxc # magic seed
318 for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
320 movdqu `&Xi_off($i)`,$Xi
326 for(;$i<32;$i++) { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
332 lea K256+128(%rip),$Tbl
334 movdqa (%rbx),$sigma # pull counters
335 cmp 4*0(%rbx),%ecx # examine counters
337 cmovge $Tbl,@ptr[0] # cancel input
342 pcmpgtd $t1,$Xn # mask value
345 paddd $Xn,$sigma # counters--
348 movdqu 0x00-0x80($ctx),$t1
350 movdqu 0x20-0x80($ctx),$t2
352 movdqu 0x40-0x80($ctx),$t3
354 movdqu 0x60-0x80($ctx),$Xi
357 movdqu 0x80-0x80($ctx),$t1
360 movdqu 0xa0-0x80($ctx),$t2
363 movdqu 0xc0-0x80($ctx),$t3
366 movdqu 0xe0-0x80($ctx),$Xi
370 movdqu $A,0x00-0x80($ctx)
372 movdqu $B,0x20-0x80($ctx)
374 movdqu $C,0x40-0x80($ctx)
375 movdqu $D,0x60-0x80($ctx)
376 movdqu $E,0x80-0x80($ctx)
377 movdqu $F,0xa0-0x80($ctx)
378 movdqu $G,0xc0-0x80($ctx)
379 movdqu $H,0xe0-0x80($ctx)
381 movdqa $sigma,(%rbx) # save counters
382 movdqa .Lpbswap(%rip),$Xn
386 mov `$REG_SZ*17+8`(%rsp),$num
387 lea $REG_SZ($ctx),$ctx
388 lea `16*$REG_SZ/4`($inp),$inp
393 mov `$REG_SZ*17`(%rsp),%rax # original %rsp
395 $code.=<<___ if ($win64);
396 movaps -0xb8(%rax),%xmm6
397 movaps -0xa8(%rax),%xmm7
398 movaps -0x98(%rax),%xmm8
399 movaps -0x88(%rax),%xmm9
400 movaps -0x78(%rax),%xmm10
401 movaps -0x68(%rax),%xmm11
402 movaps -0x58(%rax),%xmm12
403 movaps -0x48(%rax),%xmm13
404 movaps -0x38(%rax),%xmm14
405 movaps -0x28(%rax),%xmm15
413 .size sha256_multi_block,.-sha256_multi_block
416 my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
417 my @MSG0=map("%xmm$_",(4..7));
418 my @MSG1=map("%xmm$_",(8..11));
421 .type sha256_multi_block_shaext,\@function,3
423 sha256_multi_block_shaext:
429 $code.=<<___ if ($win64);
432 movaps %xmm7,0x10(%rsp)
433 movaps %xmm8,0x20(%rsp)
434 movaps %xmm9,0x30(%rsp)
435 movaps %xmm10,-0x78(%rax)
436 movaps %xmm11,-0x68(%rax)
437 movaps %xmm12,-0x58(%rax)
438 movaps %xmm13,-0x48(%rax)
439 movaps %xmm14,-0x38(%rax)
440 movaps %xmm15,-0x28(%rax)
443 sub \$`$REG_SZ*18`,%rsp
444 shl \$1,$num # we process pair at a time
446 lea 0x80($ctx),$ctx # size optimization
447 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
449 lea `$REG_SZ*16`(%rsp),%rbx
450 lea K256_shaext+0x80(%rip),$Tbl
453 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
456 for($i=0;$i<2;$i++) {
458 mov `16*$i+0`($inp),@ptr[$i] # input pointer
459 mov `16*$i+8`($inp),%ecx # number of blocks
461 cmovg %ecx,$num # find maximum
463 mov %ecx,`4*$i`(%rbx) # initialize counters
464 cmovle %rsp,@ptr[$i] # cancel input
471 movq 0x00-0x80($ctx),$ABEF0 # A1.A0
472 movq 0x20-0x80($ctx),@MSG0[0] # B1.B0
473 movq 0x40-0x80($ctx),$CDGH0 # C1.C0
474 movq 0x60-0x80($ctx),@MSG0[1] # D1.D0
475 movq 0x80-0x80($ctx),@MSG1[0] # E1.E0
476 movq 0xa0-0x80($ctx),@MSG1[1] # F1.F0
477 movq 0xc0-0x80($ctx),@MSG1[2] # G1.G0
478 movq 0xe0-0x80($ctx),@MSG1[3] # H1.H0
480 punpckldq @MSG0[0],$ABEF0 # B1.A1.B0.A0
481 punpckldq @MSG0[1],$CDGH0 # D1.C1.D0.C0
482 punpckldq @MSG1[1],@MSG1[0] # F1.E1.F0.E0
483 punpckldq @MSG1[3],@MSG1[2] # H1.G1.H0.G0
484 movdqa K256_shaext-0x10(%rip),$TMPx # byte swap
488 punpcklqdq @MSG1[0],$ABEF0 # F0.E0.B0.A0
489 punpcklqdq @MSG1[2],$CDGH0 # H0.G0.D0.C0
490 punpckhqdq @MSG1[0],$ABEF1 # F1.E1.B1.A1
491 punpckhqdq @MSG1[2],$CDGH1 # H1.G1.D1.C1
493 pshufd \$0b00011011,$ABEF0,$ABEF0
494 pshufd \$0b00011011,$CDGH0,$CDGH0
495 pshufd \$0b00011011,$ABEF1,$ABEF1
496 pshufd \$0b00011011,$CDGH1,$CDGH1
501 movdqu 0x00(@ptr[0]),@MSG0[0]
502 movdqu 0x00(@ptr[1]),@MSG1[0]
503 movdqu 0x10(@ptr[0]),@MSG0[1]
504 movdqu 0x10(@ptr[1]),@MSG1[1]
505 movdqu 0x20(@ptr[0]),@MSG0[2]
506 pshufb $TMPx,@MSG0[0]
507 movdqu 0x20(@ptr[1]),@MSG1[2]
508 pshufb $TMPx,@MSG1[0]
509 movdqu 0x30(@ptr[0]),@MSG0[3]
510 lea 0x40(@ptr[0]),@ptr[0]
511 movdqu 0x30(@ptr[1]),@MSG1[3]
512 lea 0x40(@ptr[1]),@ptr[1]
514 movdqa 0*16-0x80($Tbl),$Wi
515 pshufb $TMPx,@MSG0[1]
517 pxor $ABEF0,@MSG0[0] # black magic
519 movdqa 0*16-0x80($Tbl),$TMP1
520 pshufb $TMPx,@MSG1[1]
522 movdqa $CDGH0,0x50(%rsp) # offload
523 sha256rnds2 $ABEF0,$CDGH0 # 0-3
524 pxor $ABEF1,@MSG1[0] # black magic
526 movdqa $CDGH1,0x70(%rsp)
527 sha256rnds2 $ABEF1,$CDGH1 # 0-3
528 pshufd \$0x0e,$TMP0,$Wi
529 pxor $ABEF0,@MSG0[0] # black magic
530 movdqa $ABEF0,0x40(%rsp) # offload
531 sha256rnds2 $CDGH0,$ABEF0
532 pshufd \$0x0e,$TMP1,$Wi
533 pxor $ABEF1,@MSG1[0] # black magic
534 movdqa $ABEF1,0x60(%rsp)
535 movdqa 1*16-0x80($Tbl),$TMP0
537 pshufb $TMPx,@MSG0[2]
538 sha256rnds2 $CDGH1,$ABEF1
541 movdqa 1*16-0x80($Tbl),$TMP1
543 sha256rnds2 $ABEF0,$CDGH0 # 4-7
545 prefetcht0 127(@ptr[0])
546 pshufb $TMPx,@MSG0[3]
547 pshufb $TMPx,@MSG1[2]
548 prefetcht0 127(@ptr[1])
549 sha256rnds2 $ABEF1,$CDGH1 # 4-7
550 pshufd \$0x0e,$TMP0,$Wi
551 pshufb $TMPx,@MSG1[3]
552 sha256msg1 @MSG0[1],@MSG0[0]
553 sha256rnds2 $CDGH0,$ABEF0
554 pshufd \$0x0e,$TMP1,$Wi
555 movdqa 2*16-0x80($Tbl),$TMP0
557 sha256rnds2 $CDGH1,$ABEF1
560 movdqa 2*16-0x80($Tbl),$TMP1
562 sha256rnds2 $ABEF0,$CDGH0 # 8-11
563 sha256msg1 @MSG1[1],@MSG1[0]
565 movdqa @MSG0[3],$TMPx
566 sha256rnds2 $ABEF1,$CDGH1 # 8-11
567 pshufd \$0x0e,$TMP0,$Wi
568 palignr \$4,@MSG0[2],$TMPx
570 movdqa @MSG1[3],$TMPx
571 palignr \$4,@MSG1[2],$TMPx
572 sha256msg1 @MSG0[2],@MSG0[1]
573 sha256rnds2 $CDGH0,$ABEF0
574 pshufd \$0x0e,$TMP1,$Wi
575 movdqa 3*16-0x80($Tbl),$TMP0
577 sha256rnds2 $CDGH1,$ABEF1
578 sha256msg1 @MSG1[2],@MSG1[1]
581 movdqa 3*16-0x80($Tbl),$TMP1
584 sha256msg2 @MSG0[3],@MSG0[0]
585 sha256rnds2 $ABEF0,$CDGH0 # 12-15
587 movdqa @MSG0[0],$TMPx
588 palignr \$4,@MSG0[3],$TMPx
589 sha256rnds2 $ABEF1,$CDGH1 # 12-15
590 sha256msg2 @MSG1[3],@MSG1[0]
591 pshufd \$0x0e,$TMP0,$Wi
593 movdqa @MSG1[0],$TMPx
594 palignr \$4,@MSG1[3],$TMPx
595 sha256msg1 @MSG0[3],@MSG0[2]
596 sha256rnds2 $CDGH0,$ABEF0
597 pshufd \$0x0e,$TMP1,$Wi
598 movdqa 4*16-0x80($Tbl),$TMP0
600 sha256rnds2 $CDGH1,$ABEF1
601 sha256msg1 @MSG1[3],@MSG1[2]
603 for($i=4;$i<16-3;$i++) {
606 movdqa $i*16-0x80($Tbl),$TMP1
609 sha256msg2 @MSG0[0],@MSG0[1]
610 sha256rnds2 $ABEF0,$CDGH0 # 16-19...
612 movdqa @MSG0[1],$TMPx
613 palignr \$4,@MSG0[0],$TMPx
614 sha256rnds2 $ABEF1,$CDGH1 # 16-19...
615 sha256msg2 @MSG1[0],@MSG1[1]
616 pshufd \$0x0e,$TMP0,$Wi
618 movdqa @MSG1[1],$TMPx
619 palignr \$4,@MSG1[0],$TMPx
620 sha256msg1 @MSG0[0],@MSG0[3]
621 sha256rnds2 $CDGH0,$ABEF0
622 pshufd \$0x0e,$TMP1,$Wi
623 movdqa `($i+1)*16`-0x80($Tbl),$TMP0
625 sha256rnds2 $CDGH1,$ABEF1
626 sha256msg1 @MSG1[0],@MSG1[3]
628 push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1));
632 movdqa 13*16-0x80($Tbl),$TMP1
635 sha256msg2 @MSG0[0],@MSG0[1]
636 sha256rnds2 $ABEF0,$CDGH0 # 52-55
638 movdqa @MSG0[1],$TMPx
639 palignr \$4,@MSG0[0],$TMPx
640 sha256rnds2 $ABEF1,$CDGH1 # 52-55
641 sha256msg2 @MSG1[0],@MSG1[1]
642 pshufd \$0x0e,$TMP0,$Wi
644 movdqa @MSG1[1],$TMPx
645 palignr \$4,@MSG1[0],$TMPx
647 sha256rnds2 $CDGH0,$ABEF0
648 pshufd \$0x0e,$TMP1,$Wi
649 movdqa 14*16-0x80($Tbl),$TMP0
651 sha256rnds2 $CDGH1,$ABEF1
654 movdqa 14*16-0x80($Tbl),$TMP1
657 sha256msg2 @MSG0[1],@MSG0[2]
659 sha256rnds2 $ABEF0,$CDGH0 # 56-59
662 pxor @MSG0[1],@MSG0[1] # zero
663 sha256rnds2 $ABEF1,$CDGH1 # 56-59
664 sha256msg2 @MSG1[1],@MSG1[2]
665 pshufd \$0x0e,$TMP0,$Wi
666 movdqa 15*16-0x80($Tbl),$TMP0
668 movq (%rbx),@MSG0[2] # pull counters
670 sha256rnds2 $CDGH0,$ABEF0
671 pshufd \$0x0e,$TMP1,$Wi
672 movdqa 15*16-0x80($Tbl),$TMP1
674 sha256rnds2 $CDGH1,$ABEF1
677 cmp 4*0(%rbx),%ecx # examine counters
678 cmovge %rsp,@ptr[0] # cancel input
681 pshufd \$0x00,@MSG0[2],@MSG1[0]
682 sha256rnds2 $ABEF0,$CDGH0 # 60-63
684 pshufd \$0x55,@MSG0[2],@MSG1[1]
685 movdqa @MSG0[2],@MSG1[2]
686 sha256rnds2 $ABEF1,$CDGH1 # 60-63
687 pshufd \$0x0e,$TMP0,$Wi
688 pcmpgtd @MSG0[1],@MSG1[0]
689 pcmpgtd @MSG0[1],@MSG1[1]
690 sha256rnds2 $CDGH0,$ABEF0
691 pshufd \$0x0e,$TMP1,$Wi
692 pcmpgtd @MSG0[1],@MSG1[2] # counter mask
693 movdqa K256_shaext-0x10(%rip),$TMPx
694 sha256rnds2 $CDGH1,$ABEF1
700 paddd @MSG0[2],@MSG1[2] # counters--
702 paddd 0x50(%rsp),$CDGH0
703 paddd 0x70(%rsp),$CDGH1
704 paddd 0x40(%rsp),$ABEF0
705 paddd 0x60(%rsp),$ABEF1
707 movq @MSG1[2],(%rbx) # save counters
711 mov `$REG_SZ*17+8`(%rsp),$num
713 pshufd \$0b00011011,$ABEF0,$ABEF0
714 pshufd \$0b00011011,$CDGH0,$CDGH0
715 pshufd \$0b00011011,$ABEF1,$ABEF1
716 pshufd \$0b00011011,$CDGH1,$CDGH1
718 movdqa $ABEF0,@MSG0[0]
719 movdqa $CDGH0,@MSG0[1]
720 punpckldq $ABEF1,$ABEF0 # B1.B0.A1.A0
721 punpckhdq $ABEF1,@MSG0[0] # F1.F0.E1.E0
722 punpckldq $CDGH1,$CDGH0 # D1.D0.C1.C0
723 punpckhdq $CDGH1,@MSG0[1] # H1.H0.G1.G0
725 movq $ABEF0,0x00-0x80($ctx) # A1.A0
727 movq @MSG0[0],0x80-0x80($ctx) # E1.E0
729 movq $ABEF0,0x20-0x80($ctx) # B1.B0
730 movq @MSG0[0],0xa0-0x80($ctx) # F1.F0
732 movq $CDGH0,0x40-0x80($ctx) # C1.C0
734 movq @MSG0[1],0xc0-0x80($ctx) # G1.G0
736 movq $CDGH0,0x60-0x80($ctx) # D1.D0
737 movq @MSG0[1],0xe0-0x80($ctx) # H1.H0
739 lea `$REG_SZ/2`($ctx),$ctx
740 lea `16*2`($inp),$inp
742 jnz .Loop_grande_shaext
745 #mov `$REG_SZ*17`(%rsp),%rax # original %rsp
747 $code.=<<___ if ($win64);
748 movaps -0xb8(%rax),%xmm6
749 movaps -0xa8(%rax),%xmm7
750 movaps -0x98(%rax),%xmm8
751 movaps -0x88(%rax),%xmm9
752 movaps -0x78(%rax),%xmm10
753 movaps -0x68(%rax),%xmm11
754 movaps -0x58(%rax),%xmm12
755 movaps -0x48(%rax),%xmm13
756 movaps -0x38(%rax),%xmm14
757 movaps -0x28(%rax),%xmm15
765 .size sha256_multi_block_shaext,.-sha256_multi_block_shaext
769 sub ROUND_00_15_avx {
770 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
772 $code.=<<___ if ($i<15 && $REG_SZ==16);
773 vmovd `4*$i`(@ptr[0]),$Xi
774 vmovd `4*$i`(@ptr[1]),$t1
775 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
776 vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
777 vpunpckldq $t1,$Xi,$Xi
780 $code.=<<___ if ($i==15 && $REG_SZ==16);
781 vmovd `4*$i`(@ptr[0]),$Xi
782 lea `16*4`(@ptr[0]),@ptr[0]
783 vmovd `4*$i`(@ptr[1]),$t1
784 lea `16*4`(@ptr[1]),@ptr[1]
785 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
786 lea `16*4`(@ptr[2]),@ptr[2]
787 vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
788 lea `16*4`(@ptr[3]),@ptr[3]
789 vpunpckldq $t1,$Xi,$Xi
792 $code.=<<___ if ($i<15 && $REG_SZ==32);
793 vmovd `4*$i`(@ptr[0]),$Xi
794 vmovd `4*$i`(@ptr[4]),$t1
795 vmovd `4*$i`(@ptr[1]),$t2
796 vmovd `4*$i`(@ptr[5]),$t3
797 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
798 vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
799 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
800 vpunpckldq $t2,$Xi,$Xi
801 vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
802 vpunpckldq $t3,$t1,$t1
803 vinserti128 $t1,$Xi,$Xi
806 $code.=<<___ if ($i==15 && $REG_SZ==32);
807 vmovd `4*$i`(@ptr[0]),$Xi
808 lea `16*4`(@ptr[0]),@ptr[0]
809 vmovd `4*$i`(@ptr[4]),$t1
810 lea `16*4`(@ptr[4]),@ptr[4]
811 vmovd `4*$i`(@ptr[1]),$t2
812 lea `16*4`(@ptr[1]),@ptr[1]
813 vmovd `4*$i`(@ptr[5]),$t3
814 lea `16*4`(@ptr[5]),@ptr[5]
815 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
816 lea `16*4`(@ptr[2]),@ptr[2]
817 vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
818 lea `16*4`(@ptr[6]),@ptr[6]
819 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
820 lea `16*4`(@ptr[3]),@ptr[3]
821 vpunpckldq $t2,$Xi,$Xi
822 vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
823 lea `16*4`(@ptr[7]),@ptr[7]
824 vpunpckldq $t3,$t1,$t1
825 vinserti128 $t1,$Xi,$Xi
831 vmovdqu $Xi,`&Xi_off($i)`
832 vpaddd $h,$Xi,$Xi # Xi+=h
835 vpxor $t3,$sigma,$sigma
837 vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi # Xi+=K[round]
838 vpxor $t2,$sigma,$sigma
841 vpxor $t3,$sigma,$sigma
842 `"prefetcht0 63(@ptr[0])" if ($i==15)`
845 vpand $f,$e,$axb # borrow $axb
846 `"prefetcht0 63(@ptr[1])" if ($i==15)`
847 vpxor $t2,$sigma,$sigma
849 vpsrld \$2,$a,$h # borrow $h
850 vpxor $t3,$sigma,$sigma # Sigma1(e)
851 `"prefetcht0 63(@ptr[2])" if ($i==15)`
853 vpxor $axb,$t1,$t1 # Ch(e,f,g)
854 vpxor $a,$b,$axb # a^b, b^c in next round
855 `"prefetcht0 63(@ptr[3])" if ($i==15)`
857 vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e)
860 `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
862 vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g)
864 `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
868 vpxor $t3,$sigma,$sigma
869 `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
871 vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
872 vpaddd $Xi,$d,$d # d+=Xi
873 `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
874 vpxor $t2,$sigma,$sigma
875 vpxor $t3,$sigma,$sigma # Sigma0(a)
877 vpaddd $Xi,$h,$h # h+=Xi
878 vpaddd $sigma,$h,$h # h+=Sigma0(a)
880 $code.=<<___ if (($i%8)==7);
883 ($axb,$bxc)=($bxc,$axb);
886 sub ROUND_16_XX_avx {
890 vmovdqu `&Xi_off($i+1)`,$Xn
891 vpaddd `&Xi_off($i+9)`,$Xi,$Xi # Xi+=X[i+9]
893 vpsrld \$3,$Xn,$sigma
896 vpxor $t2,$sigma,$sigma
898 vpxor $t3,$sigma,$sigma
900 vmovdqu `&Xi_off($i+14)`,$t1
901 vpsrld \$10,$t1,$axb # borrow $axb
903 vpxor $t2,$sigma,$sigma
905 vpxor $t3,$sigma,$sigma # sigma0(X[i+1])
907 vpaddd $sigma,$Xi,$Xi # Xi+=sigma0(e)
908 vpxor $t2,$axb,$sigma
910 vpxor $t3,$sigma,$sigma
912 vpxor $t2,$sigma,$sigma
913 vpxor $t3,$sigma,$sigma # sigma0(X[i+14])
914 vpaddd $sigma,$Xi,$Xi # Xi+=sigma1(X[i+14])
916 &ROUND_00_15_avx($i,@_);
921 .type sha256_multi_block_avx,\@function,3
923 sha256_multi_block_avx:
926 $code.=<<___ if ($avx>1);
941 $code.=<<___ if ($win64);
944 movaps %xmm7,0x10(%rsp)
945 movaps %xmm8,0x20(%rsp)
946 movaps %xmm9,0x30(%rsp)
947 movaps %xmm10,-0x78(%rax)
948 movaps %xmm11,-0x68(%rax)
949 movaps %xmm12,-0x58(%rax)
950 movaps %xmm13,-0x48(%rax)
951 movaps %xmm14,-0x38(%rax)
952 movaps %xmm15,-0x28(%rax)
955 sub \$`$REG_SZ*18`, %rsp
957 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
959 lea K256+128(%rip),$Tbl
960 lea `$REG_SZ*16`(%rsp),%rbx
961 lea 0x80($ctx),$ctx # size optimization
964 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
967 for($i=0;$i<4;$i++) {
969 mov `16*$i+0`($inp),@ptr[$i] # input pointer
970 mov `16*$i+8`($inp),%ecx # number of blocks
972 cmovg %ecx,$num # find maximum
974 mov %ecx,`4*$i`(%rbx) # initialize counters
975 cmovle $Tbl,@ptr[$i] # cancel input
982 vmovdqu 0x00-0x80($ctx),$A # load context
984 vmovdqu 0x20-0x80($ctx),$B
985 vmovdqu 0x40-0x80($ctx),$C
986 vmovdqu 0x60-0x80($ctx),$D
987 vmovdqu 0x80-0x80($ctx),$E
988 vmovdqu 0xa0-0x80($ctx),$F
989 vmovdqu 0xc0-0x80($ctx),$G
990 vmovdqu 0xe0-0x80($ctx),$H
991 vmovdqu .Lpbswap(%rip),$Xn
996 vpxor $B,$C,$bxc # magic seed
998 for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
1000 vmovdqu `&Xi_off($i)`,$Xi
1006 for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
1012 lea K256+128(%rip),$Tbl
1014 for($i=0;$i<4;$i++) {
1016 cmp `4*$i`(%rbx),%ecx # examine counters
1017 cmovge $Tbl,@ptr[$i] # cancel input
1021 vmovdqa (%rbx),$sigma # pull counters
1024 vpcmpgtd $t1,$Xn,$Xn # mask value
1025 vpaddd $Xn,$sigma,$sigma # counters--
1027 vmovdqu 0x00-0x80($ctx),$t1
1029 vmovdqu 0x20-0x80($ctx),$t2
1031 vmovdqu 0x40-0x80($ctx),$t3
1033 vmovdqu 0x60-0x80($ctx),$Xi
1036 vmovdqu 0x80-0x80($ctx),$t1
1039 vmovdqu 0xa0-0x80($ctx),$t2
1042 vmovdqu 0xc0-0x80($ctx),$t3
1045 vmovdqu 0xe0-0x80($ctx),$Xi
1049 vmovdqu $A,0x00-0x80($ctx)
1051 vmovdqu $B,0x20-0x80($ctx)
1053 vmovdqu $C,0x40-0x80($ctx)
1054 vmovdqu $D,0x60-0x80($ctx)
1055 vmovdqu $E,0x80-0x80($ctx)
1056 vmovdqu $F,0xa0-0x80($ctx)
1057 vmovdqu $G,0xc0-0x80($ctx)
1058 vmovdqu $H,0xe0-0x80($ctx)
1060 vmovdqu $sigma,(%rbx) # save counters
1061 vmovdqu .Lpbswap(%rip),$Xn
1065 mov `$REG_SZ*17+8`(%rsp),$num
1066 lea $REG_SZ($ctx),$ctx
1067 lea `16*$REG_SZ/4`($inp),$inp
1069 jnz .Loop_grande_avx
1072 mov `$REG_SZ*17`(%rsp),%rax # original %rsp
1075 $code.=<<___ if ($win64);
1076 movaps -0xb8(%rax),%xmm6
1077 movaps -0xa8(%rax),%xmm7
1078 movaps -0x98(%rax),%xmm8
1079 movaps -0x88(%rax),%xmm9
1080 movaps -0x78(%rax),%xmm10
1081 movaps -0x68(%rax),%xmm11
1082 movaps -0x58(%rax),%xmm12
1083 movaps -0x48(%rax),%xmm13
1084 movaps -0x38(%rax),%xmm14
1085 movaps -0x28(%rax),%xmm15
1093 .size sha256_multi_block_avx,.-sha256_multi_block_avx
1096 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1099 @ptr=map("%r$_",(12..15,8..11));
1101 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
1102 ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));
1105 .type sha256_multi_block_avx2,\@function,3
1107 sha256_multi_block_avx2:
1117 $code.=<<___ if ($win64);
1118 lea -0xa8(%rsp),%rsp
1120 movaps %xmm7,0x10(%rsp)
1121 movaps %xmm8,0x20(%rsp)
1122 movaps %xmm9,0x30(%rsp)
1123 movaps %xmm10,0x40(%rsp)
1124 movaps %xmm11,0x50(%rsp)
1125 movaps %xmm12,-0x78(%rax)
1126 movaps %xmm13,-0x68(%rax)
1127 movaps %xmm14,-0x58(%rax)
1128 movaps %xmm15,-0x48(%rax)
1131 sub \$`$REG_SZ*18`, %rsp
1133 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
1135 lea K256+128(%rip),$Tbl
1136 lea 0x80($ctx),$ctx # size optimization
1139 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
1141 lea `$REG_SZ*16`(%rsp),%rbx
1143 for($i=0;$i<8;$i++) {
1145 mov `16*$i+0`($inp),@ptr[$i] # input pointer
1146 mov `16*$i+8`($inp),%ecx # number of blocks
1148 cmovg %ecx,$num # find maximum
1150 mov %ecx,`4*$i`(%rbx) # initialize counters
1151 cmovle $Tbl,@ptr[$i] # cancel input
1155 vmovdqu 0x00-0x80($ctx),$A # load context
1157 vmovdqu 0x20-0x80($ctx),$B
1158 lea 256+128(%rsp),%rbx
1159 vmovdqu 0x40-0x80($ctx),$C
1160 vmovdqu 0x60-0x80($ctx),$D
1161 vmovdqu 0x80-0x80($ctx),$E
1162 vmovdqu 0xa0-0x80($ctx),$F
1163 vmovdqu 0xc0-0x80($ctx),$G
1164 vmovdqu 0xe0-0x80($ctx),$H
1165 vmovdqu .Lpbswap(%rip),$Xn
1170 vpxor $B,$C,$bxc # magic seed
1172 for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
1174 vmovdqu `&Xi_off($i)`,$Xi
1176 jmp .Loop_16_xx_avx2
1180 for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
1183 jnz .Loop_16_xx_avx2
1186 lea `$REG_SZ*16`(%rsp),%rbx
1187 lea K256+128(%rip),$Tbl
1189 for($i=0;$i<8;$i++) {
1191 cmp `4*$i`(%rbx),%ecx # examine counters
1192 cmovge $Tbl,@ptr[$i] # cancel input
1196 vmovdqa (%rbx),$sigma # pull counters
1199 vpcmpgtd $t1,$Xn,$Xn # mask value
1200 vpaddd $Xn,$sigma,$sigma # counters--
1202 vmovdqu 0x00-0x80($ctx),$t1
1204 vmovdqu 0x20-0x80($ctx),$t2
1206 vmovdqu 0x40-0x80($ctx),$t3
1208 vmovdqu 0x60-0x80($ctx),$Xi
1211 vmovdqu 0x80-0x80($ctx),$t1
1214 vmovdqu 0xa0-0x80($ctx),$t2
1217 vmovdqu 0xc0-0x80($ctx),$t3
1220 vmovdqu 0xe0-0x80($ctx),$Xi
1224 vmovdqu $A,0x00-0x80($ctx)
1226 vmovdqu $B,0x20-0x80($ctx)
1228 vmovdqu $C,0x40-0x80($ctx)
1229 vmovdqu $D,0x60-0x80($ctx)
1230 vmovdqu $E,0x80-0x80($ctx)
1231 vmovdqu $F,0xa0-0x80($ctx)
1232 vmovdqu $G,0xc0-0x80($ctx)
1233 vmovdqu $H,0xe0-0x80($ctx)
1235 vmovdqu $sigma,(%rbx) # save counters
1236 lea 256+128(%rsp),%rbx
1237 vmovdqu .Lpbswap(%rip),$Xn
1241 #mov `$REG_SZ*17+8`(%rsp),$num
1242 #lea $REG_SZ($ctx),$ctx
1243 #lea `16*$REG_SZ/4`($inp),$inp
1245 #jnz .Loop_grande_avx2
1248 mov `$REG_SZ*17`(%rsp),%rax # original %rsp
1251 $code.=<<___ if ($win64);
1252 movaps -0xd8(%rax),%xmm6
1253 movaps -0xc8(%rax),%xmm7
1254 movaps -0xb8(%rax),%xmm8
1255 movaps -0xa8(%rax),%xmm9
1256 movaps -0x98(%rax),%xmm10
1257 movaps -0x88(%rax),%xmm11
1258 movaps -0x78(%rax),%xmm12
1259 movaps -0x68(%rax),%xmm13
1260 movaps -0x58(%rax),%xmm14
1261 movaps -0x48(%rax),%xmm15
1273 .size sha256_multi_block_avx2,.-sha256_multi_block_avx2
1288 &TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
1289 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
1290 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
1291 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
1292 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
1293 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
1294 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
1295 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
1296 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
1297 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
1298 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
1299 0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
1300 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
1301 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
1302 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
1303 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
1306 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
1307 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
1309 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1310 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1311 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1312 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1313 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1314 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1315 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1316 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1317 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1318 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1319 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1320 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1321 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1322 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1323 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1324 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1325 .asciz "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1329 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1330 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1337 .extern __imp_RtlVirtualUnwind
1338 .type se_handler,\@abi-omnipotent
1352 mov 120($context),%rax # pull context->Rax
1353 mov 248($context),%rbx # pull context->Rip
1355 mov 8($disp),%rsi # disp->ImageBase
1356 mov 56($disp),%r11 # disp->HandlerData
1358 mov 0(%r11),%r10d # HandlerData[0]
1359 lea (%rsi,%r10),%r10 # end of prologue label
1360 cmp %r10,%rbx # context->Rip<.Lbody
1363 mov 152($context),%rax # pull context->Rsp
1365 mov 4(%r11),%r10d # HandlerData[1]
1366 lea (%rsi,%r10),%r10 # epilogue label
1367 cmp %r10,%rbx # context->Rip>=.Lepilogue
1370 mov `16*17`(%rax),%rax # pull saved stack pointer
1374 mov %rbx,144($context) # restore context->Rbx
1375 mov %rbp,160($context) # restore context->Rbp
1377 lea -24-10*16(%rax),%rsi
1378 lea 512($context),%rdi # &context.Xmm6
1380 .long 0xa548f3fc # cld; rep movsq
1385 mov %rax,152($context) # restore context->Rsp
1386 mov %rsi,168($context) # restore context->Rsi
1387 mov %rdi,176($context) # restore context->Rdi
1389 mov 40($disp),%rdi # disp->ContextRecord
1390 mov $context,%rsi # context
1391 mov \$154,%ecx # sizeof(CONTEXT)
1392 .long 0xa548f3fc # cld; rep movsq
1395 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1396 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1397 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1398 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1399 mov 40(%rsi),%r10 # disp->ContextRecord
1400 lea 56(%rsi),%r11 # &disp->HandlerData
1401 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1402 mov %r10,32(%rsp) # arg5
1403 mov %r11,40(%rsp) # arg6
1404 mov %r12,48(%rsp) # arg7
1405 mov %rcx,56(%rsp) # arg8, (NULL)
1406 call *__imp_RtlVirtualUnwind(%rip)
1408 mov \$1,%eax # ExceptionContinueSearch
1420 .size se_handler,.-se_handler
1422 $code.=<<___ if ($avx>1);
1423 .type avx2_handler,\@abi-omnipotent
1437 mov 120($context),%rax # pull context->Rax
1438 mov 248($context),%rbx # pull context->Rip
1440 mov 8($disp),%rsi # disp->ImageBase
1441 mov 56($disp),%r11 # disp->HandlerData
1443 mov 0(%r11),%r10d # HandlerData[0]
1444 lea (%rsi,%r10),%r10 # end of prologue label
1445 cmp %r10,%rbx # context->Rip<body label
1448 mov 152($context),%rax # pull context->Rsp
1450 mov 4(%r11),%r10d # HandlerData[1]
1451 lea (%rsi,%r10),%r10 # epilogue label
1452 cmp %r10,%rbx # context->Rip>=epilogue label
1455 mov `32*17`($context),%rax # pull saved stack pointer
1463 mov %rbx,144($context) # restore context->Rbx
1464 mov %rbp,160($context) # restore context->Rbp
1465 	mov	%r12,216($context)	# restore context->R12
1466 	mov	%r13,224($context)	# restore context->R13
1467 	mov	%r14,232($context)	# restore context->R14
1468 	mov	%r15,240($context)	# restore context->R15
1470 lea -56-10*16(%rax),%rsi
1471 lea 512($context),%rdi # &context.Xmm6
1473 .long 0xa548f3fc # cld; rep movsq
1476 .size avx2_handler,.-avx2_handler
1481 .rva .LSEH_begin_sha256_multi_block
1482 .rva .LSEH_end_sha256_multi_block
1483 .rva .LSEH_info_sha256_multi_block
1484 .rva .LSEH_begin_sha256_multi_block_shaext
1485 .rva .LSEH_end_sha256_multi_block_shaext
1486 .rva .LSEH_info_sha256_multi_block_shaext
1488 $code.=<<___ if ($avx);
1489 .rva .LSEH_begin_sha256_multi_block_avx
1490 .rva .LSEH_end_sha256_multi_block_avx
1491 .rva .LSEH_info_sha256_multi_block_avx
1493 $code.=<<___ if ($avx>1);
1494 .rva .LSEH_begin_sha256_multi_block_avx2
1495 .rva .LSEH_end_sha256_multi_block_avx2
1496 .rva .LSEH_info_sha256_multi_block_avx2
1501 .LSEH_info_sha256_multi_block:
1504 .rva .Lbody,.Lepilogue # HandlerData[]
1505 .LSEH_info_sha256_multi_block_shaext:
1508 .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[]
1510 $code.=<<___ if ($avx);
1511 .LSEH_info_sha256_multi_block_avx:
1514 .rva .Lbody_avx,.Lepilogue_avx # HandlerData[]
1516 $code.=<<___ if ($avx>1);
1517 .LSEH_info_sha256_multi_block_avx2:
1520 .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[]
1523 ####################################################################
1526 local *opcode=shift;
1530 $rex|=0x04 if ($dst>=8);
1531 $rex|=0x01 if ($src>=8);
1532 unshift @opcode,$rex|0x40 if ($rex);
1538 "sha256rnds2" => 0xcb,
1539 "sha256msg1" => 0xcc,
1540 "sha256msg2" => 0xcd );
1542 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1543 my @opcode=(0x0f,0x38);
1544 rex(\@opcode,$2,$1);
1545 push @opcode,$opcodelet{$instr};
1546 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
1547 return ".byte\t".join(',',@opcode);
1549 return $instr."\t".@_[0];
1553 foreach (split("\n",$code)) {
1554 s/\`([^\`]*)\`/eval($1)/ge;
1556 s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo or
1558 s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1559 s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
1560 s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or
1561 s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1562 s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or
1563 s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;