3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # Multi-buffer SHA256 procedure processes n buffers in parallel by
11 # placing buffer data to designated lane of SIMD register. n is
12 # naturally limited to 4 on pre-AVX2 processors and to 8 on
13 # AVX2-capable processors such as Haswell.
15 # this +aesni(i) sha256 aesni-sha256 gain(iv)
16 # -------------------------------------------------------------------
17 # Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126%
18 # Atom(ii) 38.7/n +3.93=13.6(n=4) 20.8 +5.69=26.5 +95%
19 # Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103%
20 # Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82%
21 # Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170%
22 # Bulldozer (21.6 +5.76=27.4)/n 13.6 13.7 +100%
24 # (i) multi-block CBC encrypt with 128-bit key;
25 # (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
26 # because of lower AES-NI instruction throughput, nor is there
27 # AES-NI-SHA256 stitch for these processors;
28 # (iii) "this" is for n=8, when we gather twice as much data, result
29 # for n=4 is 20.3+4.44=24.7;
30 # (iv) presented improvement coefficients are asymptotic limits and
31 # in real-life application are somewhat lower, e.g. for 2KB
32 # fragments they range from 75% to 130% (on Haswell);
36 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
38 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
40 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
41 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
42 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
43 die "can't locate x86_64-xlate.pl";
47 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
48 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
49 $avx = ($1>=2.19) + ($1>=2.22);
52 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
53 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
54 $avx = ($1>=2.09) + ($1>=2.10);
57 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
58 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
59 $avx = ($1>=10) + ($1>=11);
62 if (!$avx && `$ENV{CC} -v 2>&1` =~ /LLVM ([3-9]\.[0-9]+)/) {
63 $avx = ($1>=3.0) + ($1>=3.1);
66 open OUT,"| \"$^X\" $xlate $flavour $output";
69 # void sha256_multi_block (
70 # struct { unsigned int A[8];
77 # unsigned int H[8]; } *ctx,
78 # struct { void *ptr; int blocks; } inp[8],
79 # int num); /* 1 or 2 */
81 $ctx="%rdi"; # 1st arg
82 $inp="%rsi"; # 2nd arg
83 $num="%edx"; # 3rd arg
84 @ptr=map("%r$_",(8..11));
87 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
88 ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));
95 $off %= 16; $off *= $REG_SZ;
96 $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
100 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
102 $code.=<<___ if ($i<15);
103 movd `4*$i`(@ptr[0]),$Xi
104 movd `4*$i`(@ptr[1]),$t1
105 movd `4*$i`(@ptr[2]),$t2
106 movd `4*$i`(@ptr[3]),$t3
111 $code.=<<___ if ($i==15);
112 movd `4*$i`(@ptr[0]),$Xi
113 lea `16*4`(@ptr[0]),@ptr[0]
114 movd `4*$i`(@ptr[1]),$t1
115 lea `16*4`(@ptr[1]),@ptr[1]
116 movd `4*$i`(@ptr[2]),$t2
117 lea `16*4`(@ptr[2]),@ptr[2]
118 movd `4*$i`(@ptr[3]),$t3
119 lea `16*4`(@ptr[3]),@ptr[3]
126 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==0)`
128 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==1)`
132 movdqa $Xi,`&Xi_off($i)`
138 paddd `32*($i%8)-128`($Tbl),$Xi # Xi+=K[round]
143 `"prefetcht0 63(@ptr[0])" if ($i==15)`
145 movdqa $e,$axb # borrow $axb
151 `"prefetcht0 63(@ptr[1])" if ($i==15)`
153 pxor $t3,$sigma # Sigma1(e)
156 paddd $sigma,$Xi # Xi+=Sigma1(e)
157 pxor $axb,$t1 # Ch(e,f,g)
161 pxor $a,$axb # a^b, b^c in next round
163 `"prefetcht0 63(@ptr[2])" if ($i==15)`
166 paddd $t1,$Xi # Xi+=Ch(e,f,g)
171 `"prefetcht0 63(@ptr[3])" if ($i==15)`
177 pxor $bxc,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
179 pxor $t3,$sigma # Sigma0(a)
182 paddd $sigma,$h # h+=Sigma0(a)
184 $code.=<<___ if (($i%8)==7);
185 lea `32*8`($Tbl),$Tbl
187 ($axb,$bxc)=($bxc,$axb);
194 movdqa `&Xi_off($i+1)`,$Xn
195 paddd `&Xi_off($i+9)`,$Xi # Xi+=X[i+9]
203 movdqa `&Xi_off($i+14)`,$t1
207 movdqa $t1,$axb # borrow $axb
215 pxor $t3,$sigma # sigma0(X[i+1])
217 paddd $sigma,$Xi # Xi+=sigma0(e)
223 pxor $t2,$t1 # sigma0(X[i+14])
224 paddd $t1,$Xi # Xi+=sigma1(X[i+14])
233 .extern OPENSSL_ia32cap_P
235 .globl sha256_multi_block
236 .type sha256_multi_block,\@function,3
239 mov OPENSSL_ia32cap_P+4(%rip),%rcx
240 bt \$61,%rcx # check SHA bit
243 $code.=<<___ if ($avx);
252 $code.=<<___ if ($win64);
255 movaps %xmm7,0x10(%rsp)
256 movaps %xmm8,0x20(%rsp)
257 movaps %xmm9,0x30(%rsp)
258 movaps %xmm10,-0x78(%rax)
259 movaps %xmm11,-0x68(%rax)
260 movaps %xmm12,-0x58(%rax)
261 movaps %xmm13,-0x48(%rax)
262 movaps %xmm14,-0x38(%rax)
263 movaps %xmm15,-0x28(%rax)
266 sub \$`$REG_SZ*18`, %rsp
268 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
270 lea K256+128(%rip),$Tbl
271 lea `$REG_SZ*16`(%rsp),%rbx
272 lea 0x80($ctx),$ctx # size optimization
275 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
278 for($i=0;$i<4;$i++) {
280 mov `16*$i+0`($inp),@ptr[$i] # input pointer
281 mov `16*$i+8`($inp),%ecx # number of blocks
283 cmovg %ecx,$num # find maximum
285 mov %ecx,`4*$i`(%rbx) # initialize counters
286 cmovle $Tbl,@ptr[$i] # cancel input
293 movdqu 0x00-0x80($ctx),$A # load context
295 movdqu 0x20-0x80($ctx),$B
296 movdqu 0x40-0x80($ctx),$C
297 movdqu 0x60-0x80($ctx),$D
298 movdqu 0x80-0x80($ctx),$E
299 movdqu 0xa0-0x80($ctx),$F
300 movdqu 0xc0-0x80($ctx),$G
301 movdqu 0xe0-0x80($ctx),$H
302 movdqu .Lpbswap(%rip),$Xn
308 pxor $B,$bxc # magic seed
310 for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
312 movdqu `&Xi_off($i)`,$Xi
318 for(;$i<32;$i++) { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
324 lea K256+128(%rip),$Tbl
326 movdqa (%rbx),$sigma # pull counters
327 cmp 4*0(%rbx),%ecx # examine counters
329 cmovge $Tbl,@ptr[0] # cancel input
334 pcmpgtd $t1,$Xn # mask value
337 paddd $Xn,$sigma # counters--
340 movdqu 0x00-0x80($ctx),$t1
342 movdqu 0x20-0x80($ctx),$t2
344 movdqu 0x40-0x80($ctx),$t3
346 movdqu 0x60-0x80($ctx),$Xi
349 movdqu 0x80-0x80($ctx),$t1
352 movdqu 0xa0-0x80($ctx),$t2
355 movdqu 0xc0-0x80($ctx),$t3
358 movdqu 0xe0-0x80($ctx),$Xi
362 movdqu $A,0x00-0x80($ctx)
364 movdqu $B,0x20-0x80($ctx)
366 movdqu $C,0x40-0x80($ctx)
367 movdqu $D,0x60-0x80($ctx)
368 movdqu $E,0x80-0x80($ctx)
369 movdqu $F,0xa0-0x80($ctx)
370 movdqu $G,0xc0-0x80($ctx)
371 movdqu $H,0xe0-0x80($ctx)
373 movdqa $sigma,(%rbx) # save counters
374 movdqa .Lpbswap(%rip),$Xn
378 mov `$REG_SZ*17+8`(%rsp),$num
379 lea $REG_SZ($ctx),$ctx
380 lea `16*$REG_SZ/4`($inp),$inp
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
387 $code.=<<___ if ($win64);
388 movaps -0xb8(%rax),%xmm6
389 movaps -0xa8(%rax),%xmm7
390 movaps -0x98(%rax),%xmm8
391 movaps -0x88(%rax),%xmm9
392 movaps -0x78(%rax),%xmm10
393 movaps -0x68(%rax),%xmm11
394 movaps -0x58(%rax),%xmm12
395 movaps -0x48(%rax),%xmm13
396 movaps -0x38(%rax),%xmm14
397 movaps -0x28(%rax),%xmm15
405 .size sha256_multi_block,.-sha256_multi_block
408 my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
409 my @MSG0=map("%xmm$_",(4..7));
410 my @MSG1=map("%xmm$_",(8..11));
413 .type sha256_multi_block_shaext,\@function,3
415 sha256_multi_block_shaext:
421 $code.=<<___ if ($win64);
424 movaps %xmm7,0x10(%rsp)
425 movaps %xmm8,0x20(%rsp)
426 movaps %xmm9,0x30(%rsp)
427 movaps %xmm10,-0x78(%rax)
428 movaps %xmm11,-0x68(%rax)
429 movaps %xmm12,-0x58(%rax)
430 movaps %xmm13,-0x48(%rax)
431 movaps %xmm14,-0x38(%rax)
432 movaps %xmm15,-0x28(%rax)
435 sub \$`$REG_SZ*18`,%rsp
436 shl \$1,$num # we process pair at a time
438 lea 0x80($ctx),$ctx # size optimization
439 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
441 lea `$REG_SZ*16`(%rsp),%rbx
442 lea K256_shaext+0x80(%rip),$Tbl
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
448 for($i=0;$i<2;$i++) {
450 mov `16*$i+0`($inp),@ptr[$i] # input pointer
451 mov `16*$i+8`($inp),%ecx # number of blocks
453 cmovg %ecx,$num # find maximum
455 mov %ecx,`4*$i`(%rbx) # initialize counters
456 cmovle %rsp,@ptr[$i] # cancel input
463 movq 0x00-0x80($ctx),$ABEF0 # A1.A0
464 movq 0x20-0x80($ctx),@MSG0[0] # B1.B0
465 movq 0x40-0x80($ctx),$CDGH0 # C1.C0
466 movq 0x60-0x80($ctx),@MSG0[1] # D1.D0
467 movq 0x80-0x80($ctx),@MSG1[0] # E1.E0
468 movq 0xa0-0x80($ctx),@MSG1[1] # F1.F0
469 movq 0xc0-0x80($ctx),@MSG1[2] # G1.G0
470 movq 0xe0-0x80($ctx),@MSG1[3] # H1.H0
472 punpckldq @MSG0[0],$ABEF0 # B1.A1.B0.A0
473 punpckldq @MSG0[1],$CDGH0 # D1.C1.D0.C0
474 punpckldq @MSG1[1],@MSG1[0] # F1.E1.F0.E0
475 punpckldq @MSG1[3],@MSG1[2] # H1.G1.H0.G0
476 movdqa K256_shaext-0x10(%rip),$TMPx # byte swap
480 punpcklqdq @MSG1[0],$ABEF0 # F0.E0.B0.A0
481 punpcklqdq @MSG1[2],$CDGH0 # H0.G0.D0.C0
482 punpckhqdq @MSG1[0],$ABEF1 # F1.E1.B1.A1
483 punpckhqdq @MSG1[2],$CDGH1 # H1.G1.D1.C1
485 pshufd \$0b00011011,$ABEF0,$ABEF0
486 pshufd \$0b00011011,$CDGH0,$CDGH0
487 pshufd \$0b00011011,$ABEF1,$ABEF1
488 pshufd \$0b00011011,$CDGH1,$CDGH1
493 movdqu 0x00(@ptr[0]),@MSG0[0]
494 movdqu 0x00(@ptr[1]),@MSG1[0]
495 movdqu 0x10(@ptr[0]),@MSG0[1]
496 movdqu 0x10(@ptr[1]),@MSG1[1]
497 movdqu 0x20(@ptr[0]),@MSG0[2]
498 pshufb $TMPx,@MSG0[0]
499 movdqu 0x20(@ptr[1]),@MSG1[2]
500 pshufb $TMPx,@MSG1[0]
501 movdqu 0x30(@ptr[0]),@MSG0[3]
502 lea 0x40(@ptr[0]),@ptr[0]
503 movdqu 0x30(@ptr[1]),@MSG1[3]
504 lea 0x40(@ptr[1]),@ptr[1]
506 movdqa 0*16-0x80($Tbl),$Wi
507 pshufb $TMPx,@MSG0[1]
509 pxor $ABEF0,@MSG0[0] # black magic
511 movdqa 0*16-0x80($Tbl),$TMP1
512 pshufb $TMPx,@MSG1[1]
514 movdqa $CDGH0,0x50(%rsp) # offload
515 sha256rnds2 $ABEF0,$CDGH0 # 0-3
516 pxor $ABEF1,@MSG1[0] # black magic
518 movdqa $CDGH1,0x70(%rsp)
519 sha256rnds2 $ABEF1,$CDGH1 # 0-3
520 pshufd \$0x0e,$TMP0,$Wi
521 pxor $ABEF0,@MSG0[0] # black magic
522 movdqa $ABEF0,0x40(%rsp) # offload
523 sha256rnds2 $CDGH0,$ABEF0
524 pshufd \$0x0e,$TMP1,$Wi
525 pxor $ABEF1,@MSG1[0] # black magic
526 movdqa $ABEF1,0x60(%rsp)
527 movdqa 1*16-0x80($Tbl),$TMP0
529 pshufb $TMPx,@MSG0[2]
530 sha256rnds2 $CDGH1,$ABEF1
533 movdqa 1*16-0x80($Tbl),$TMP1
535 sha256rnds2 $ABEF0,$CDGH0 # 4-7
537 prefetcht0 127(@ptr[0])
538 pshufb $TMPx,@MSG0[3]
539 pshufb $TMPx,@MSG1[2]
540 prefetcht0 127(@ptr[1])
541 sha256rnds2 $ABEF1,$CDGH1 # 4-7
542 pshufd \$0x0e,$TMP0,$Wi
543 pshufb $TMPx,@MSG1[3]
544 sha256msg1 @MSG0[1],@MSG0[0]
545 sha256rnds2 $CDGH0,$ABEF0
546 pshufd \$0x0e,$TMP1,$Wi
547 movdqa 2*16-0x80($Tbl),$TMP0
549 sha256rnds2 $CDGH1,$ABEF1
552 movdqa 2*16-0x80($Tbl),$TMP1
554 sha256rnds2 $ABEF0,$CDGH0 # 8-11
555 sha256msg1 @MSG1[1],@MSG1[0]
557 movdqa @MSG0[3],$TMPx
558 sha256rnds2 $ABEF1,$CDGH1 # 8-11
559 pshufd \$0x0e,$TMP0,$Wi
560 palignr \$4,@MSG0[2],$TMPx
562 movdqa @MSG1[3],$TMPx
563 palignr \$4,@MSG1[2],$TMPx
564 sha256msg1 @MSG0[2],@MSG0[1]
565 sha256rnds2 $CDGH0,$ABEF0
566 pshufd \$0x0e,$TMP1,$Wi
567 movdqa 3*16-0x80($Tbl),$TMP0
569 sha256rnds2 $CDGH1,$ABEF1
570 sha256msg1 @MSG1[2],@MSG1[1]
573 movdqa 3*16-0x80($Tbl),$TMP1
576 sha256msg2 @MSG0[3],@MSG0[0]
577 sha256rnds2 $ABEF0,$CDGH0 # 12-15
579 movdqa @MSG0[0],$TMPx
580 palignr \$4,@MSG0[3],$TMPx
581 sha256rnds2 $ABEF1,$CDGH1 # 12-15
582 sha256msg2 @MSG1[3],@MSG1[0]
583 pshufd \$0x0e,$TMP0,$Wi
585 movdqa @MSG1[0],$TMPx
586 palignr \$4,@MSG1[3],$TMPx
587 sha256msg1 @MSG0[3],@MSG0[2]
588 sha256rnds2 $CDGH0,$ABEF0
589 pshufd \$0x0e,$TMP1,$Wi
590 movdqa 4*16-0x80($Tbl),$TMP0
592 sha256rnds2 $CDGH1,$ABEF1
593 sha256msg1 @MSG1[3],@MSG1[2]
595 for($i=4;$i<16-3;$i++) {
598 movdqa $i*16-0x80($Tbl),$TMP1
601 sha256msg2 @MSG0[0],@MSG0[1]
602 sha256rnds2 $ABEF0,$CDGH0 # 16-19...
604 movdqa @MSG0[1],$TMPx
605 palignr \$4,@MSG0[0],$TMPx
606 sha256rnds2 $ABEF1,$CDGH1 # 16-19...
607 sha256msg2 @MSG1[0],@MSG1[1]
608 pshufd \$0x0e,$TMP0,$Wi
610 movdqa @MSG1[1],$TMPx
611 palignr \$4,@MSG1[0],$TMPx
612 sha256msg1 @MSG0[0],@MSG0[3]
613 sha256rnds2 $CDGH0,$ABEF0
614 pshufd \$0x0e,$TMP1,$Wi
615 movdqa `($i+1)*16`-0x80($Tbl),$TMP0
617 sha256rnds2 $CDGH1,$ABEF1
618 sha256msg1 @MSG1[0],@MSG1[3]
620 push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1));
624 movdqa 13*16-0x80($Tbl),$TMP1
627 sha256msg2 @MSG0[0],@MSG0[1]
628 sha256rnds2 $ABEF0,$CDGH0 # 52-55
630 movdqa @MSG0[1],$TMPx
631 palignr \$4,@MSG0[0],$TMPx
632 sha256rnds2 $ABEF1,$CDGH1 # 52-55
633 sha256msg2 @MSG1[0],@MSG1[1]
634 pshufd \$0x0e,$TMP0,$Wi
636 movdqa @MSG1[1],$TMPx
637 palignr \$4,@MSG1[0],$TMPx
639 sha256rnds2 $CDGH0,$ABEF0
640 pshufd \$0x0e,$TMP1,$Wi
641 movdqa 14*16-0x80($Tbl),$TMP0
643 sha256rnds2 $CDGH1,$ABEF1
646 movdqa 14*16-0x80($Tbl),$TMP1
649 sha256msg2 @MSG0[1],@MSG0[2]
651 sha256rnds2 $ABEF0,$CDGH0 # 56-59
654 pxor @MSG0[1],@MSG0[1] # zero
655 sha256rnds2 $ABEF1,$CDGH1 # 56-59
656 sha256msg2 @MSG1[1],@MSG1[2]
657 pshufd \$0x0e,$TMP0,$Wi
658 movdqa 15*16-0x80($Tbl),$TMP0
660 movq (%rbx),@MSG0[2] # pull counters
662 sha256rnds2 $CDGH0,$ABEF0
663 pshufd \$0x0e,$TMP1,$Wi
664 movdqa 15*16-0x80($Tbl),$TMP1
666 sha256rnds2 $CDGH1,$ABEF1
669 cmp 4*0(%rbx),%ecx # examine counters
670 cmovge %rsp,@ptr[0] # cancel input
673 pshufd \$0x00,@MSG0[2],@MSG1[0]
674 sha256rnds2 $ABEF0,$CDGH0 # 60-63
676 pshufd \$0x55,@MSG0[2],@MSG1[1]
677 movdqa @MSG0[2],@MSG1[2]
678 sha256rnds2 $ABEF1,$CDGH1 # 60-63
679 pshufd \$0x0e,$TMP0,$Wi
680 pcmpgtd @MSG0[1],@MSG1[0]
681 pcmpgtd @MSG0[1],@MSG1[1]
682 sha256rnds2 $CDGH0,$ABEF0
683 pshufd \$0x0e,$TMP1,$Wi
684 pcmpgtd @MSG0[1],@MSG1[2] # counter mask
685 movdqa K256_shaext-0x10(%rip),$TMPx
686 sha256rnds2 $CDGH1,$ABEF1
692 paddd @MSG0[2],@MSG1[2] # counters--
694 paddd 0x50(%rsp),$CDGH0
695 paddd 0x70(%rsp),$CDGH1
696 paddd 0x40(%rsp),$ABEF0
697 paddd 0x60(%rsp),$ABEF1
699 movq @MSG1[2],(%rbx) # save counters
703 mov `$REG_SZ*17+8`(%rsp),$num
705 pshufd \$0b00011011,$ABEF0,$ABEF0
706 pshufd \$0b00011011,$CDGH0,$CDGH0
707 pshufd \$0b00011011,$ABEF1,$ABEF1
708 pshufd \$0b00011011,$CDGH1,$CDGH1
710 movdqa $ABEF0,@MSG0[0]
711 movdqa $CDGH0,@MSG0[1]
712 punpckldq $ABEF1,$ABEF0 # B1.B0.A1.A0
713 punpckhdq $ABEF1,@MSG0[0] # F1.F0.E1.E0
714 punpckldq $CDGH1,$CDGH0 # D1.D0.C1.C0
715 punpckhdq $CDGH1,@MSG0[1] # H1.H0.G1.G0
717 movq $ABEF0,0x00-0x80($ctx) # A1.A0
719 movq @MSG0[0],0x80-0x80($ctx) # E1.E0
721 movq $ABEF0,0x20-0x80($ctx) # B1.B0
722 movq @MSG0[0],0xa0-0x80($ctx) # F1.F0
724 movq $CDGH0,0x40-0x80($ctx) # C1.C0
726 movq @MSG0[1],0xc0-0x80($ctx) # G1.G0
728 movq $CDGH0,0x60-0x80($ctx) # D1.D0
729 movq @MSG0[1],0xe0-0x80($ctx) # H1.H0
731 lea `$REG_SZ/2`($ctx),$ctx
732 lea `16*2`($inp),$inp
734 jnz .Loop_grande_shaext
737 #mov `$REG_SZ*17`(%rsp),%rax # original %rsp
739 $code.=<<___ if ($win64);
740 movaps -0xb8(%rax),%xmm6
741 movaps -0xa8(%rax),%xmm7
742 movaps -0x98(%rax),%xmm8
743 movaps -0x88(%rax),%xmm9
744 movaps -0x78(%rax),%xmm10
745 movaps -0x68(%rax),%xmm11
746 movaps -0x58(%rax),%xmm12
747 movaps -0x48(%rax),%xmm13
748 movaps -0x38(%rax),%xmm14
749 movaps -0x28(%rax),%xmm15
757 .size sha256_multi_block_shaext,.-sha256_multi_block_shaext
761 sub ROUND_00_15_avx {
762 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
764 $code.=<<___ if ($i<15 && $REG_SZ==16);
765 vmovd `4*$i`(@ptr[0]),$Xi
766 vmovd `4*$i`(@ptr[1]),$t1
767 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
768 vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
769 vpunpckldq $t1,$Xi,$Xi
772 $code.=<<___ if ($i==15 && $REG_SZ==16);
773 vmovd `4*$i`(@ptr[0]),$Xi
774 lea `16*4`(@ptr[0]),@ptr[0]
775 vmovd `4*$i`(@ptr[1]),$t1
776 lea `16*4`(@ptr[1]),@ptr[1]
777 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
778 lea `16*4`(@ptr[2]),@ptr[2]
779 vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
780 lea `16*4`(@ptr[3]),@ptr[3]
781 vpunpckldq $t1,$Xi,$Xi
784 $code.=<<___ if ($i<15 && $REG_SZ==32);
785 vmovd `4*$i`(@ptr[0]),$Xi
786 vmovd `4*$i`(@ptr[4]),$t1
787 vmovd `4*$i`(@ptr[1]),$t2
788 vmovd `4*$i`(@ptr[5]),$t3
789 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
790 vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
791 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
792 vpunpckldq $t2,$Xi,$Xi
793 vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
794 vpunpckldq $t3,$t1,$t1
795 vinserti128 $t1,$Xi,$Xi
798 $code.=<<___ if ($i==15 && $REG_SZ==32);
799 vmovd `4*$i`(@ptr[0]),$Xi
800 lea `16*4`(@ptr[0]),@ptr[0]
801 vmovd `4*$i`(@ptr[4]),$t1
802 lea `16*4`(@ptr[4]),@ptr[4]
803 vmovd `4*$i`(@ptr[1]),$t2
804 lea `16*4`(@ptr[1]),@ptr[1]
805 vmovd `4*$i`(@ptr[5]),$t3
806 lea `16*4`(@ptr[5]),@ptr[5]
807 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
808 lea `16*4`(@ptr[2]),@ptr[2]
809 vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
810 lea `16*4`(@ptr[6]),@ptr[6]
811 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
812 lea `16*4`(@ptr[3]),@ptr[3]
813 vpunpckldq $t2,$Xi,$Xi
814 vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
815 lea `16*4`(@ptr[7]),@ptr[7]
816 vpunpckldq $t3,$t1,$t1
817 vinserti128 $t1,$Xi,$Xi
823 vmovdqu $Xi,`&Xi_off($i)`
824 vpaddd $h,$Xi,$Xi # Xi+=h
827 vpxor $t3,$sigma,$sigma
829 vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi # Xi+=K[round]
830 vpxor $t2,$sigma,$sigma
833 vpxor $t3,$sigma,$sigma
834 `"prefetcht0 63(@ptr[0])" if ($i==15)`
837 vpand $f,$e,$axb # borrow $axb
838 `"prefetcht0 63(@ptr[1])" if ($i==15)`
839 vpxor $t2,$sigma,$sigma
841 vpsrld \$2,$a,$h # borrow $h
842 vpxor $t3,$sigma,$sigma # Sigma1(e)
843 `"prefetcht0 63(@ptr[2])" if ($i==15)`
845 vpxor $axb,$t1,$t1 # Ch(e,f,g)
846 vpxor $a,$b,$axb # a^b, b^c in next round
847 `"prefetcht0 63(@ptr[3])" if ($i==15)`
849 vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e)
852 `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
854 vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g)
856 `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
860 vpxor $t3,$sigma,$sigma
861 `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
863 vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
864 vpaddd $Xi,$d,$d # d+=Xi
865 `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
866 vpxor $t2,$sigma,$sigma
867 vpxor $t3,$sigma,$sigma # Sigma0(a)
869 vpaddd $Xi,$h,$h # h+=Xi
870 vpaddd $sigma,$h,$h # h+=Sigma0(a)
872 $code.=<<___ if (($i%8)==7);
875 ($axb,$bxc)=($bxc,$axb);
878 sub ROUND_16_XX_avx {
882 vmovdqu `&Xi_off($i+1)`,$Xn
883 vpaddd `&Xi_off($i+9)`,$Xi,$Xi # Xi+=X[i+9]
885 vpsrld \$3,$Xn,$sigma
888 vpxor $t2,$sigma,$sigma
890 vpxor $t3,$sigma,$sigma
892 vmovdqu `&Xi_off($i+14)`,$t1
893 vpsrld \$10,$t1,$axb # borrow $axb
895 vpxor $t2,$sigma,$sigma
897 vpxor $t3,$sigma,$sigma # sigma0(X[i+1])
899 vpaddd $sigma,$Xi,$Xi # Xi+=sigma0(e)
900 vpxor $t2,$axb,$sigma
902 vpxor $t3,$sigma,$sigma
904 vpxor $t2,$sigma,$sigma
905 vpxor $t3,$sigma,$sigma # sigma0(X[i+14])
906 vpaddd $sigma,$Xi,$Xi # Xi+=sigma1(X[i+14])
908 &ROUND_00_15_avx($i,@_);
913 .type sha256_multi_block_avx,\@function,3
915 sha256_multi_block_avx:
918 $code.=<<___ if ($avx>1);
933 $code.=<<___ if ($win64);
936 movaps %xmm7,0x10(%rsp)
937 movaps %xmm8,0x20(%rsp)
938 movaps %xmm9,0x30(%rsp)
939 movaps %xmm10,-0x78(%rax)
940 movaps %xmm11,-0x68(%rax)
941 movaps %xmm12,-0x58(%rax)
942 movaps %xmm13,-0x48(%rax)
943 movaps %xmm14,-0x38(%rax)
944 movaps %xmm15,-0x28(%rax)
947 sub \$`$REG_SZ*18`, %rsp
949 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
951 lea K256+128(%rip),$Tbl
952 lea `$REG_SZ*16`(%rsp),%rbx
953 lea 0x80($ctx),$ctx # size optimization
956 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
959 for($i=0;$i<4;$i++) {
961 mov `16*$i+0`($inp),@ptr[$i] # input pointer
962 mov `16*$i+8`($inp),%ecx # number of blocks
964 cmovg %ecx,$num # find maximum
966 mov %ecx,`4*$i`(%rbx) # initialize counters
967 cmovle $Tbl,@ptr[$i] # cancel input
974 vmovdqu 0x00-0x80($ctx),$A # load context
976 vmovdqu 0x20-0x80($ctx),$B
977 vmovdqu 0x40-0x80($ctx),$C
978 vmovdqu 0x60-0x80($ctx),$D
979 vmovdqu 0x80-0x80($ctx),$E
980 vmovdqu 0xa0-0x80($ctx),$F
981 vmovdqu 0xc0-0x80($ctx),$G
982 vmovdqu 0xe0-0x80($ctx),$H
983 vmovdqu .Lpbswap(%rip),$Xn
988 vpxor $B,$C,$bxc # magic seed
990 for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
992 vmovdqu `&Xi_off($i)`,$Xi
998 for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
1004 lea K256+128(%rip),$Tbl
1006 for($i=0;$i<4;$i++) {
1008 cmp `4*$i`(%rbx),%ecx # examine counters
1009 cmovge $Tbl,@ptr[$i] # cancel input
1013 vmovdqa (%rbx),$sigma # pull counters
1016 vpcmpgtd $t1,$Xn,$Xn # mask value
1017 vpaddd $Xn,$sigma,$sigma # counters--
1019 vmovdqu 0x00-0x80($ctx),$t1
1021 vmovdqu 0x20-0x80($ctx),$t2
1023 vmovdqu 0x40-0x80($ctx),$t3
1025 vmovdqu 0x60-0x80($ctx),$Xi
1028 vmovdqu 0x80-0x80($ctx),$t1
1031 vmovdqu 0xa0-0x80($ctx),$t2
1034 vmovdqu 0xc0-0x80($ctx),$t3
1037 vmovdqu 0xe0-0x80($ctx),$Xi
1041 vmovdqu $A,0x00-0x80($ctx)
1043 vmovdqu $B,0x20-0x80($ctx)
1045 vmovdqu $C,0x40-0x80($ctx)
1046 vmovdqu $D,0x60-0x80($ctx)
1047 vmovdqu $E,0x80-0x80($ctx)
1048 vmovdqu $F,0xa0-0x80($ctx)
1049 vmovdqu $G,0xc0-0x80($ctx)
1050 vmovdqu $H,0xe0-0x80($ctx)
1052 vmovdqu $sigma,(%rbx) # save counters
1053 vmovdqu .Lpbswap(%rip),$Xn
1057 mov `$REG_SZ*17+8`(%rsp),$num
1058 lea $REG_SZ($ctx),$ctx
1059 lea `16*$REG_SZ/4`($inp),$inp
1061 jnz .Loop_grande_avx
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
1067 $code.=<<___ if ($win64);
1068 movaps -0xb8(%rax),%xmm6
1069 movaps -0xa8(%rax),%xmm7
1070 movaps -0x98(%rax),%xmm8
1071 movaps -0x88(%rax),%xmm9
1072 movaps -0x78(%rax),%xmm10
1073 movaps -0x68(%rax),%xmm11
1074 movaps -0x58(%rax),%xmm12
1075 movaps -0x48(%rax),%xmm13
1076 movaps -0x38(%rax),%xmm14
1077 movaps -0x28(%rax),%xmm15
1085 .size sha256_multi_block_avx,.-sha256_multi_block_avx
1088 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1091 @ptr=map("%r$_",(12..15,8..11));
1093 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
1094 ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));
1097 .type sha256_multi_block_avx2,\@function,3
1099 sha256_multi_block_avx2:
1109 $code.=<<___ if ($win64);
1110 lea -0xa8(%rsp),%rsp
1112 movaps %xmm7,0x10(%rsp)
1113 movaps %xmm8,0x20(%rsp)
1114 movaps %xmm9,0x30(%rsp)
1115 movaps %xmm10,0x40(%rsp)
1116 movaps %xmm11,0x50(%rsp)
1117 movaps %xmm12,-0x78(%rax)
1118 movaps %xmm13,-0x68(%rax)
1119 movaps %xmm14,-0x58(%rax)
1120 movaps %xmm15,-0x48(%rax)
1123 sub \$`$REG_SZ*18`, %rsp
1125 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
1127 lea K256+128(%rip),$Tbl
1128 lea 0x80($ctx),$ctx # size optimization
1131 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
1133 lea `$REG_SZ*16`(%rsp),%rbx
1135 for($i=0;$i<8;$i++) {
1137 mov `16*$i+0`($inp),@ptr[$i] # input pointer
1138 mov `16*$i+8`($inp),%ecx # number of blocks
1140 cmovg %ecx,$num # find maximum
1142 mov %ecx,`4*$i`(%rbx) # initialize counters
1143 cmovle $Tbl,@ptr[$i] # cancel input
1147 vmovdqu 0x00-0x80($ctx),$A # load context
1149 vmovdqu 0x20-0x80($ctx),$B
1150 lea 256+128(%rsp),%rbx
1151 vmovdqu 0x40-0x80($ctx),$C
1152 vmovdqu 0x60-0x80($ctx),$D
1153 vmovdqu 0x80-0x80($ctx),$E
1154 vmovdqu 0xa0-0x80($ctx),$F
1155 vmovdqu 0xc0-0x80($ctx),$G
1156 vmovdqu 0xe0-0x80($ctx),$H
1157 vmovdqu .Lpbswap(%rip),$Xn
1162 vpxor $B,$C,$bxc # magic seed
1164 for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
1166 vmovdqu `&Xi_off($i)`,$Xi
1168 jmp .Loop_16_xx_avx2
1172 for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
1175 jnz .Loop_16_xx_avx2
1178 lea `$REG_SZ*16`(%rsp),%rbx
1179 lea K256+128(%rip),$Tbl
1181 for($i=0;$i<8;$i++) {
1183 cmp `4*$i`(%rbx),%ecx # examine counters
1184 cmovge $Tbl,@ptr[$i] # cancel input
1188 vmovdqa (%rbx),$sigma # pull counters
1191 vpcmpgtd $t1,$Xn,$Xn # mask value
1192 vpaddd $Xn,$sigma,$sigma # counters--
1194 vmovdqu 0x00-0x80($ctx),$t1
1196 vmovdqu 0x20-0x80($ctx),$t2
1198 vmovdqu 0x40-0x80($ctx),$t3
1200 vmovdqu 0x60-0x80($ctx),$Xi
1203 vmovdqu 0x80-0x80($ctx),$t1
1206 vmovdqu 0xa0-0x80($ctx),$t2
1209 vmovdqu 0xc0-0x80($ctx),$t3
1212 vmovdqu 0xe0-0x80($ctx),$Xi
1216 vmovdqu $A,0x00-0x80($ctx)
1218 vmovdqu $B,0x20-0x80($ctx)
1220 vmovdqu $C,0x40-0x80($ctx)
1221 vmovdqu $D,0x60-0x80($ctx)
1222 vmovdqu $E,0x80-0x80($ctx)
1223 vmovdqu $F,0xa0-0x80($ctx)
1224 vmovdqu $G,0xc0-0x80($ctx)
1225 vmovdqu $H,0xe0-0x80($ctx)
1227 vmovdqu $sigma,(%rbx) # save counters
1228 lea 256+128(%rsp),%rbx
1229 vmovdqu .Lpbswap(%rip),$Xn
1233 #mov `$REG_SZ*17+8`(%rsp),$num
1234 #lea $REG_SZ($ctx),$ctx
1235 #lea `16*$REG_SZ/4`($inp),$inp
1237 #jnz .Loop_grande_avx2
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
1243 $code.=<<___ if ($win64);
1244 movaps -0xd8(%rax),%xmm6
1245 movaps -0xc8(%rax),%xmm7
1246 movaps -0xb8(%rax),%xmm8
1247 movaps -0xa8(%rax),%xmm9
1248 movaps -0x98(%rax),%xmm10
1249 movaps -0x88(%rax),%xmm11
1250 movaps -0x78(%rax),%xmm12
1251 movaps -0x68(%rax),%xmm13
1252 movaps -0x58(%rax),%xmm14
1253 movaps -0x48(%rax),%xmm15
1265 .size sha256_multi_block_avx2,.-sha256_multi_block_avx2
1280 &TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
1281 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
1282 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
1283 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
1284 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
1285 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
1286 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
1287 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
1288 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
1289 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
1290 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
1291 0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
1292 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
1293 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
1294 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
1295 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
1298 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
1299 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
1301 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1302 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1303 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1304 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1305 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1306 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1307 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1308 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1309 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1310 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1311 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1312 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1313 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1314 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1315 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1316 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1317 .asciz "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1321 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1322 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1329 .extern __imp_RtlVirtualUnwind
1330 .type se_handler,\@abi-omnipotent
1344 mov 120($context),%rax # pull context->Rax
1345 mov 248($context),%rbx # pull context->Rip
1347 mov 8($disp),%rsi # disp->ImageBase
1348 mov 56($disp),%r11 # disp->HandlerData
1350 mov 0(%r11),%r10d # HandlerData[0]
1351 lea (%rsi,%r10),%r10 # end of prologue label
1352 cmp %r10,%rbx # context->Rip<.Lbody
1355 mov 152($context),%rax # pull context->Rsp
1357 mov 4(%r11),%r10d # HandlerData[1]
1358 lea (%rsi,%r10),%r10 # epilogue label
1359 cmp %r10,%rbx # context->Rip>=.Lepilogue
1362 mov `16*17`(%rax),%rax # pull saved stack pointer
1366 mov %rbx,144($context) # restore context->Rbx
1367 mov %rbp,160($context) # restore context->Rbp
1369 lea -24-10*16(%rax),%rsi
1370 lea 512($context),%rdi # &context.Xmm6
1372 .long 0xa548f3fc # cld; rep movsq
1377 mov %rax,152($context) # restore context->Rsp
1378 mov %rsi,168($context) # restore context->Rsi
1379 mov %rdi,176($context) # restore context->Rdi
1381 mov 40($disp),%rdi # disp->ContextRecord
1382 mov $context,%rsi # context
1383 mov \$154,%ecx # sizeof(CONTEXT)
1384 .long 0xa548f3fc # cld; rep movsq
1387 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1388 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1389 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1390 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1391 mov 40(%rsi),%r10 # disp->ContextRecord
1392 lea 56(%rsi),%r11 # &disp->HandlerData
1393 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1394 mov %r10,32(%rsp) # arg5
1395 mov %r11,40(%rsp) # arg6
1396 mov %r12,48(%rsp) # arg7
1397 mov %rcx,56(%rsp) # arg8, (NULL)
1398 call *__imp_RtlVirtualUnwind(%rip)
1400 mov \$1,%eax # ExceptionContinueSearch
1412 .size se_handler,.-se_handler
1414 $code.=<<___ if ($avx>1);
1415 .type avx2_handler,\@abi-omnipotent
1429 mov 120($context),%rax # pull context->Rax
1430 mov 248($context),%rbx # pull context->Rip
1432 mov 8($disp),%rsi # disp->ImageBase
1433 mov 56($disp),%r11 # disp->HandlerData
1435 mov 0(%r11),%r10d # HandlerData[0]
1436 lea (%rsi,%r10),%r10 # end of prologue label
1437 cmp %r10,%rbx # context->Rip<body label
1440 mov 152($context),%rax # pull context->Rsp
1442 mov 4(%r11),%r10d # HandlerData[1]
1443 lea (%rsi,%r10),%r10 # epilogue label
1444 cmp %r10,%rbx # context->Rip>=epilogue label
1447 mov `32*17`($context),%rax # pull saved stack pointer
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
1462 lea -56-10*16(%rax),%rsi
1463 lea 512($context),%rdi # &context.Xmm6
1465 .long 0xa548f3fc # cld; rep movsq
1468 .size avx2_handler,.-avx2_handler
1473 .rva .LSEH_begin_sha256_multi_block
1474 .rva .LSEH_end_sha256_multi_block
1475 .rva .LSEH_info_sha256_multi_block
1476 .rva .LSEH_begin_sha256_multi_block_shaext
1477 .rva .LSEH_end_sha256_multi_block_shaext
1478 .rva .LSEH_info_sha256_multi_block_shaext
1480 $code.=<<___ if ($avx);
1481 .rva .LSEH_begin_sha256_multi_block_avx
1482 .rva .LSEH_end_sha256_multi_block_avx
1483 .rva .LSEH_info_sha256_multi_block_avx
1485 $code.=<<___ if ($avx>1);
1486 .rva .LSEH_begin_sha256_multi_block_avx2
1487 .rva .LSEH_end_sha256_multi_block_avx2
1488 .rva .LSEH_info_sha256_multi_block_avx2
1493 .LSEH_info_sha256_multi_block:
1496 .rva .Lbody,.Lepilogue # HandlerData[]
1497 .LSEH_info_sha256_multi_block_shaext:
1500 .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[]
1502 $code.=<<___ if ($avx);
1503 .LSEH_info_sha256_multi_block_avx:
1506 .rva .Lbody_avx,.Lepilogue_avx # HandlerData[]
1508 $code.=<<___ if ($avx>1);
1509 .LSEH_info_sha256_multi_block_avx2:
1512 .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[]
1515 ####################################################################
1518 local *opcode=shift;
1522 $rex|=0x04 if ($dst>=8);
1523 $rex|=0x01 if ($src>=8);
1524 unshift @opcode,$rex|0x40 if ($rex);
1530 "sha256rnds2" => 0xcb,
1531 "sha256msg1" => 0xcc,
1532 "sha256msg2" => 0xcd );
1534 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1535 my @opcode=(0x0f,0x38);
1536 rex(\@opcode,$2,$1);
1537 push @opcode,$opcodelet{$instr};
1538 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
1539 return ".byte\t".join(',',@opcode);
1541 return $instr."\t".@_[0];
1545 foreach (split("\n",$code)) {
1546 s/\`([^\`]*)\`/eval($1)/ge;
1548 s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo or
1550 s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1551 s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
1552 s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or
1553 s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1554 s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or
1555 s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;