# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# The multi-buffer SHA256 procedure processes n buffers in parallel by
# placing each buffer's data into a designated lane of a SIMD
# register. n is naturally limited to 4 on pre-AVX2 processors and to
# 8 on AVX2-capable processors such as Haswell.
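#
# For instance, with n=4 each 128-bit register holds the same message
# word from four different buffers, one per 32-bit lane,
#
#       Xi = [ buf0[i] | buf1[i] | buf2[i] | buf3[i] ],
#
# so a single paddd/pxor sequence advances all four hashes by one
# round at once.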
#               this            +aesni(i)       sha256  aesni-sha256    gain(iv)
# -------------------------------------------------------------------
# Westmere(ii)  23.3/n  +1.28=7.11(n=4)         12.3    +3.75=16.1      +126%
# Atom(ii)      39.1/n  +3.93=13.7(n=4)         20.8    +5.69=26.5      +93%
# Sandy Bridge  (20.5   +5.15=25.7)/n           11.6    13.0            +103%
# Ivy Bridge    (20.4   +5.14=25.5)/n           10.3    11.6            +82%
# Haswell(iii)  (21.0   +5.00=26.0)/n           7.80    8.79            +170%
# Bulldozer     (21.6   +5.76=27.4)/n           13.6    13.7            +100%
# (i)   multi-block CBC encrypt with 128-bit key;
# (ii)  (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
#       because of lower AES-NI instruction throughput, nor is there
#       AES-NI-SHA256 stitch for these processors;
# (iii) "this" is for n=8, when we gather twice as much data, result
#       for n=4 is 20.3+4.44=24.7;
# (iv)  presented improvement coefficients are asymptotic limits and
#       in real-life application are somewhat lower, e.g. for 2KB
#       fragments they range from 75% to 130% (on Haswell);
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $avx = ($1>=2.19) + ($1>=2.22);

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
    $avx = ($1>=2.09) + ($1>=2.10);

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
    $avx = ($1>=10) + ($1>=11);
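    # $avx ends up 0, 1 or 2: 0 means the assembler understands
    # neither AVX nor AVX2, 1 means AVX only, 2 means AVX2 as well;
    # the two version thresholds in each probe above encode exactly
    # those capability levels.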
open OUT,"| \"$^X\" $xlate $flavour $output";

# void sha256_multi_block (
#     struct { unsigned int A[8];
#              ...
#              unsigned int H[8]; } *ctx,
#     struct { void *ptr; int blocks; } inp[8],
#     int num);        /* 1 or 2 */
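#
# Illustrative caller sketch (hypothetical names, not part of this
# file); lanes whose blocks count is <=0 are cancelled internally:
#
#       for (i = 0; i < 4; i++) {
#           inp[i].ptr    = msg[i];     /* 64*blocks[i] bytes */
#           inp[i].blocks = blocks[i];
#       }
#       sha256_multi_block(ctx, inp, 1);  /* one 4-lane group */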
$ctx="%rdi";    # 1st arg
$inp="%rsi";    # 2nd arg
$num="%edx";    # 3rd arg
@ptr=map("%r$_",(8..11));

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));

    $off %= 16; $off *= $REG_SZ;
    $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
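    # The 16 most recently expanded message words live in a 16-slot
    # ring buffer split across two base registers; e.g. with
    # $REG_SZ==16, Xi_off(17) gives "16-128(%rax)" since 17%16==1
    # and 1*16 < 256.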
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15);
        movd    `4*$i`(@ptr[0]),$Xi
        movd    `4*$i`(@ptr[1]),$t1
        movd    `4*$i`(@ptr[2]),$t2
        movd    `4*$i`(@ptr[3]),$t3
$code.=<<___ if ($i==15);
        movd    `4*$i`(@ptr[0]),$Xi
        lea     `16*4`(@ptr[0]),@ptr[0]
        movd    `4*$i`(@ptr[1]),$t1
        lea     `16*4`(@ptr[1]),@ptr[1]
        movd    `4*$i`(@ptr[2]),$t2
        lea     `16*4`(@ptr[2]),@ptr[2]
        movd    `4*$i`(@ptr[3]),$t3
        lea     `16*4`(@ptr[3]),@ptr[3]
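        # word 15 completes the 64-byte block, so all four input
        # pointers advance by 16*4 bytes here, ready for the next block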
        movdqa  $Xi,`&Xi_off($i)`
        paddd   `32*($i%8)-128`($Tbl),$Xi       # Xi+=K[round]
139 `"prefetch 63(@ptr[0])" if ($i==15)`
        movdqa  $e,$axb         # borrow $axb
147 `"prefetch 63(@ptr[1])" if ($i==15)`
        pxor    $t3,$sigma      # Sigma1(e)
        paddd   $sigma,$Xi      # Xi+=Sigma1(e)
        pxor    $axb,$t1        # Ch(e,f,g)
        pxor    $a,$axb         # a^b, b^c in next round
159 `"prefetch 63(@ptr[2])" if ($i==15)`
        paddd   $t1,$Xi         # Xi+=Ch(e,f,g)
167 `"prefetch 63(@ptr[3])" if ($i==15)`
        pxor    $bxc,$h         # h=Maj(a,b,c)=Ch(a^b,c,b)
        pxor    $t3,$sigma      # Sigma0(a)
        paddd   $sigma,$h       # h+=Sigma0(a)

$code.=<<___ if (($i%8)==7);
        lea     `32*8`($Tbl),$Tbl

($axb,$bxc)=($bxc,$axb);
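# $axb and $bxc swap roles every round: the a^b computed above serves
# as b^c in the next round, which is what lets Maj(a,b,c) be computed
# as Ch(a^b,c,b): where a and b agree the result is a (=b), elsewhere
# it is c, i.e. the majority.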
        movdqa  `&Xi_off($i+1)`,$Xn
        paddd   `&Xi_off($i+9)`,$Xi     # Xi+=X[i+9]
        movdqa  `&Xi_off($i+14)`,$t1
        movdqa  $t1,$axb        # borrow $axb
        pxor    $t3,$sigma      # sigma0(X[i+1])
        paddd   $sigma,$Xi      # Xi+=sigma0(X[i+1])
        pxor    $t2,$t1         # sigma1(X[i+14])
        paddd   $t1,$Xi         # Xi+=sigma1(X[i+14])
.extern OPENSSL_ia32cap_P

.globl  sha256_multi_block
.type   sha256_multi_block,\@function,3

$code.=<<___ if ($avx);
        mov     OPENSSL_ia32cap_P+4(%rip),%rcx

$code.=<<___ if ($win64);
        movaps  %xmm7,0x10(%rsp)
        movaps  %xmm8,0x20(%rsp)
        movaps  %xmm9,0x30(%rsp)
        movaps  %xmm10,-0x78(%rax)
        movaps  %xmm11,-0x68(%rax)
        movaps  %xmm12,-0x58(%rax)
        movaps  %xmm13,-0x48(%rax)
        movaps  %xmm14,-0x38(%rax)
        movaps  %xmm15,-0x28(%rax)

        sub     \$`$REG_SZ*18`, %rsp
        mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
        lea     K256+128(%rip),$Tbl
        lea     `$REG_SZ*16`(%rsp),%rbx
        lea     0x80($ctx),$ctx                 # size optimization

        mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num

for($i=0;$i<4;$i++) {
        mov     `16*$i+0`($inp),@ptr[$i]        # input pointer
        mov     `16*$i+8`($inp),%ecx            # number of blocks
        cmovg   %ecx,$num                       # find maximum
        mov     %ecx,`4*$i`(%rbx)               # initialize counters
        cmovle  $Tbl,@ptr[$i]                   # cancel input

        movdqu  0x00-0x80($ctx),$A              # load context
        movdqu  0x20-0x80($ctx),$B
        movdqu  0x40-0x80($ctx),$C
        movdqu  0x60-0x80($ctx),$D
        movdqu  0x80-0x80($ctx),$E
        movdqu  0xa0-0x80($ctx),$F
        movdqu  0xc0-0x80($ctx),$G
        movdqu  0xe0-0x80($ctx),$H
        movdqu  .Lpbswap(%rip),$Xn

        pxor    $B,$bxc                         # magic seed

for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
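# Note the round-robin register assignment: instead of moving data
# between registers at the end of each round, @V is rotated so the
# same eight xmm registers take the a..h roles in turn.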
        movdqu  `&Xi_off($i)`,$Xi

for(;$i<32;$i++) { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }

        lea     K256+128(%rip),$Tbl

        movdqa  (%rbx),$sigma                   # pull counters
        cmp     4*0(%rbx),%ecx                  # examine counters
        cmovge  $Tbl,@ptr[0]                    # cancel input

        pcmpgtd $t1,$Xn                         # mask value
        paddd   $Xn,$sigma                      # counters--
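        # exhausted lanes had their pointers redirected at the K256
        # table above, so their loads stay harmless; pcmpgtd builds an
        # all-ones mask (numerically -1) for still-active lanes, paddd
        # decrements only those counters, and the same mask later
        # selects between old and updated hash values per lane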
        movdqu  0x00-0x80($ctx),$t1
        movdqu  0x20-0x80($ctx),$t2
        movdqu  0x40-0x80($ctx),$t3
        movdqu  0x60-0x80($ctx),$Xi

        movdqu  0x80-0x80($ctx),$t1
        movdqu  0xa0-0x80($ctx),$t2
        movdqu  0xc0-0x80($ctx),$t3
        movdqu  0xe0-0x80($ctx),$Xi

        movdqu  $A,0x00-0x80($ctx)
        movdqu  $B,0x20-0x80($ctx)
        movdqu  $C,0x40-0x80($ctx)
        movdqu  $D,0x60-0x80($ctx)
        movdqu  $E,0x80-0x80($ctx)
        movdqu  $F,0xa0-0x80($ctx)
        movdqu  $G,0xc0-0x80($ctx)
        movdqu  $H,0xe0-0x80($ctx)

        movdqa  $sigma,(%rbx)                   # save counters
        movdqa  .Lpbswap(%rip),$Xn

        mov     `$REG_SZ*17+8`(%rsp),$num
        lea     $REG_SZ($ctx),$ctx
        lea     `16*$REG_SZ/4`($inp),$inp
        mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
$code.=<<___ if ($win64);
        movaps  -0xb8(%rax),%xmm6
        movaps  -0xa8(%rax),%xmm7
        movaps  -0x98(%rax),%xmm8
        movaps  -0x88(%rax),%xmm9
        movaps  -0x78(%rax),%xmm10
        movaps  -0x68(%rax),%xmm11
        movaps  -0x58(%rax),%xmm12
        movaps  -0x48(%rax),%xmm13
        movaps  -0x38(%rax),%xmm14
        movaps  -0x28(%rax),%xmm15

.size   sha256_multi_block,.-sha256_multi_block
sub ROUND_00_15_avx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15 && $REG_SZ==16);
        vmovd   `4*$i`(@ptr[0]),$Xi
        vmovd   `4*$i`(@ptr[1]),$t1
        vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
        vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
        vpunpckldq      $t1,$Xi,$Xi

$code.=<<___ if ($i==15 && $REG_SZ==16);
        vmovd   `4*$i`(@ptr[0]),$Xi
        lea     `16*4`(@ptr[0]),@ptr[0]
        vmovd   `4*$i`(@ptr[1]),$t1
        lea     `16*4`(@ptr[1]),@ptr[1]
        vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
        lea     `16*4`(@ptr[2]),@ptr[2]
        vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1
        lea     `16*4`(@ptr[3]),@ptr[3]
        vpunpckldq      $t1,$Xi,$Xi
$code.=<<___ if ($i<15 && $REG_SZ==32);
        vmovd   `4*$i`(@ptr[0]),$Xi
        vmovd   `4*$i`(@ptr[4]),$t1
        vmovd   `4*$i`(@ptr[1]),$t2
        vmovd   `4*$i`(@ptr[5]),$t3
        vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
        vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
        vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
        vpunpckldq      $t2,$Xi,$Xi
        vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
        vpunpckldq      $t3,$t1,$t1
        vinserti128     $t1,$Xi,$Xi

$code.=<<___ if ($i==15 && $REG_SZ==32);
        vmovd   `4*$i`(@ptr[0]),$Xi
        lea     `16*4`(@ptr[0]),@ptr[0]
        vmovd   `4*$i`(@ptr[4]),$t1
        lea     `16*4`(@ptr[4]),@ptr[4]
        vmovd   `4*$i`(@ptr[1]),$t2
        lea     `16*4`(@ptr[1]),@ptr[1]
        vmovd   `4*$i`(@ptr[5]),$t3
        lea     `16*4`(@ptr[5]),@ptr[5]
        vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi
        lea     `16*4`(@ptr[2]),@ptr[2]
        vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1
        lea     `16*4`(@ptr[6]),@ptr[6]
        vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2
        lea     `16*4`(@ptr[3]),@ptr[3]
        vpunpckldq      $t2,$Xi,$Xi
        vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3
        lea     `16*4`(@ptr[7]),@ptr[7]
        vpunpckldq      $t3,$t1,$t1
        vinserti128     $t1,$Xi,$Xi
        vmovdqu $Xi,`&Xi_off($i)`
        vpaddd  $h,$Xi,$Xi              # Xi+=h
        vpxor   $t3,$sigma,$sigma
        vpaddd  `32*($i%8)-128`($Tbl),$Xi,$Xi   # Xi+=K[round]
        vpxor   $t2,$sigma,$sigma
        vpxor   $t3,$sigma,$sigma
473 `"prefetch 63(@ptr[0])" if ($i==15)`
        vpand   $f,$e,$axb              # borrow $axb
477 `"prefetch 63(@ptr[1])" if ($i==15)`
        vpxor   $t2,$sigma,$sigma
        vpsrld  \$2,$a,$h               # borrow $h
        vpxor   $t3,$sigma,$sigma       # Sigma1(e)
482 `"prefetch 63(@ptr[2])" if ($i==15)`
        vpxor   $axb,$t1,$t1            # Ch(e,f,g)
        vpxor   $a,$b,$axb              # a^b, b^c in next round
486 `"prefetch 63(@ptr[3])" if ($i==15)`
        vpaddd  $sigma,$Xi,$Xi          # Xi+=Sigma1(e)
491 `"prefetch 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
        vpaddd  $t1,$Xi,$Xi             # Xi+=Ch(e,f,g)
495 `"prefetch 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
        vpxor   $t3,$sigma,$sigma
500 `"prefetch 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
        vpxor   $bxc,$b,$h              # h=Maj(a,b,c)=Ch(a^b,c,b)
        vpaddd  $Xi,$d,$d               # d+=Xi
504 `"prefetch 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
        vpxor   $t2,$sigma,$sigma
        vpxor   $t3,$sigma,$sigma       # Sigma0(a)

        vpaddd  $Xi,$h,$h               # h+=Xi
        vpaddd  $sigma,$h,$h            # h+=Sigma0(a)

$code.=<<___ if (($i%8)==7);

($axb,$bxc)=($bxc,$axb);

sub ROUND_16_XX_avx {
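# Message-schedule expansion follows the standard SHA-256 recurrence,
#       X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2]),
# expressed below with ring-buffer indices taken modulo 16:
# &Xi_off($i+1) is X[i-15], ($i+9) is X[i-7] and ($i+14) is X[i-2].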
        vmovdqu `&Xi_off($i+1)`,$Xn
        vpaddd  `&Xi_off($i+9)`,$Xi,$Xi         # Xi+=X[i+9]
        vpsrld  \$3,$Xn,$sigma
        vpxor   $t2,$sigma,$sigma
        vpxor   $t3,$sigma,$sigma
        vmovdqu `&Xi_off($i+14)`,$t1
        vpsrld  \$10,$t1,$axb                   # borrow $axb
        vpxor   $t2,$sigma,$sigma
        vpxor   $t3,$sigma,$sigma               # sigma0(X[i+1])
        vpaddd  $sigma,$Xi,$Xi                  # Xi+=sigma0(X[i+1])
        vpxor   $t2,$axb,$sigma
        vpxor   $t3,$sigma,$sigma
        vpxor   $t2,$sigma,$sigma
        vpxor   $t3,$sigma,$sigma               # sigma1(X[i+14])
        vpaddd  $sigma,$Xi,$Xi                  # Xi+=sigma1(X[i+14])

&ROUND_00_15_avx($i,@_);

.type   sha256_multi_block_avx,\@function,3
sha256_multi_block_avx:

$code.=<<___ if ($avx>1);

$code.=<<___ if ($win64);
        movaps  %xmm7,0x10(%rsp)
        movaps  %xmm8,0x20(%rsp)
        movaps  %xmm9,0x30(%rsp)
        movaps  %xmm10,-0x78(%rax)
        movaps  %xmm11,-0x68(%rax)
        movaps  %xmm12,-0x58(%rax)
        movaps  %xmm13,-0x48(%rax)
        movaps  %xmm14,-0x38(%rax)
        movaps  %xmm15,-0x28(%rax)

        sub     \$`$REG_SZ*18`, %rsp
        mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
        lea     K256+128(%rip),$Tbl
        lea     `$REG_SZ*16`(%rsp),%rbx
        lea     0x80($ctx),$ctx                 # size optimization

        mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num

for($i=0;$i<4;$i++) {
        mov     `16*$i+0`($inp),@ptr[$i]        # input pointer
        mov     `16*$i+8`($inp),%ecx            # number of blocks
        cmovg   %ecx,$num                       # find maximum
        mov     %ecx,`4*$i`(%rbx)               # initialize counters
        cmovle  $Tbl,@ptr[$i]                   # cancel input

        vmovdqu 0x00-0x80($ctx),$A              # load context
        vmovdqu 0x20-0x80($ctx),$B
        vmovdqu 0x40-0x80($ctx),$C
        vmovdqu 0x60-0x80($ctx),$D
        vmovdqu 0x80-0x80($ctx),$E
        vmovdqu 0xa0-0x80($ctx),$F
        vmovdqu 0xc0-0x80($ctx),$G
        vmovdqu 0xe0-0x80($ctx),$H
        vmovdqu .Lpbswap(%rip),$Xn

        vpxor   $B,$C,$bxc                      # magic seed

for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }

        vmovdqu `&Xi_off($i)`,$Xi

for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }

        lea     K256+128(%rip),$Tbl

for($i=0;$i<4;$i++) {
        cmp     `4*$i`(%rbx),%ecx               # examine counters
        cmovge  $Tbl,@ptr[$i]                   # cancel input

        vmovdqa (%rbx),$sigma                   # pull counters

        vpcmpgtd        $t1,$Xn,$Xn             # mask value
        vpaddd  $Xn,$sigma,$sigma               # counters--

        vmovdqu 0x00-0x80($ctx),$t1
        vmovdqu 0x20-0x80($ctx),$t2
        vmovdqu 0x40-0x80($ctx),$t3
        vmovdqu 0x60-0x80($ctx),$Xi

        vmovdqu 0x80-0x80($ctx),$t1
        vmovdqu 0xa0-0x80($ctx),$t2
        vmovdqu 0xc0-0x80($ctx),$t3
        vmovdqu 0xe0-0x80($ctx),$Xi

        vmovdqu $A,0x00-0x80($ctx)
        vmovdqu $B,0x20-0x80($ctx)
        vmovdqu $C,0x40-0x80($ctx)
        vmovdqu $D,0x60-0x80($ctx)
        vmovdqu $E,0x80-0x80($ctx)
        vmovdqu $F,0xa0-0x80($ctx)
        vmovdqu $G,0xc0-0x80($ctx)
        vmovdqu $H,0xe0-0x80($ctx)

        vmovdqu $sigma,(%rbx)                   # save counters
        vmovdqu .Lpbswap(%rip),$Xn

        mov     `$REG_SZ*17+8`(%rsp),$num
        lea     $REG_SZ($ctx),$ctx
        lea     `16*$REG_SZ/4`($inp),$inp
        mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
$code.=<<___ if ($win64);
        movaps  -0xb8(%rax),%xmm6
        movaps  -0xa8(%rax),%xmm7
        movaps  -0x98(%rax),%xmm8
        movaps  -0x88(%rax),%xmm9
        movaps  -0x78(%rax),%xmm10
        movaps  -0x68(%rax),%xmm11
        movaps  -0x58(%rax),%xmm12
        movaps  -0x48(%rax),%xmm13
        movaps  -0x38(%rax),%xmm14
        movaps  -0x28(%rax),%xmm15

.size   sha256_multi_block_avx,.-sha256_multi_block_avx

$code =~ s/\`([^\`]*)\`/eval $1/gem;

@ptr=map("%r$_",(12..15,8..11));

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));
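# The AVX2 path below reuses ROUND_00_15_avx and ROUND_16_XX_avx
# verbatim; only the register map changes to %ymm, giving eight lanes,
# and the regex pass at the end of the file narrows the handful of
# instructions that must keep %xmm operands.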
.type   sha256_multi_block_avx2,\@function,3
sha256_multi_block_avx2:

$code.=<<___ if ($win64);
        movaps  %xmm7,0x10(%rsp)
        movaps  %xmm8,0x20(%rsp)
        movaps  %xmm9,0x30(%rsp)
        movaps  %xmm10,0x40(%rsp)
        movaps  %xmm11,0x50(%rsp)
        movaps  %xmm12,-0x78(%rax)
        movaps  %xmm13,-0x68(%rax)
        movaps  %xmm14,-0x58(%rax)
        movaps  %xmm15,-0x48(%rax)

        sub     \$`$REG_SZ*18`, %rsp
        mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
        lea     K256+128(%rip),$Tbl
        lea     0x80($ctx),$ctx                 # size optimization

        mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num

        lea     `$REG_SZ*16`(%rsp),%rbx

for($i=0;$i<8;$i++) {
        mov     `16*$i+0`($inp),@ptr[$i]        # input pointer
        mov     `16*$i+8`($inp),%ecx            # number of blocks
        cmovg   %ecx,$num                       # find maximum
        mov     %ecx,`4*$i`(%rbx)               # initialize counters
        cmovle  $Tbl,@ptr[$i]                   # cancel input

        vmovdqu 0x00-0x80($ctx),$A              # load context
        vmovdqu 0x20-0x80($ctx),$B
        lea     256+128(%rsp),%rbx
        vmovdqu 0x40-0x80($ctx),$C
        vmovdqu 0x60-0x80($ctx),$D
        vmovdqu 0x80-0x80($ctx),$E
        vmovdqu 0xa0-0x80($ctx),$F
        vmovdqu 0xc0-0x80($ctx),$G
        vmovdqu 0xe0-0x80($ctx),$H
        vmovdqu .Lpbswap(%rip),$Xn

        vpxor   $B,$C,$bxc                      # magic seed

for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }

        vmovdqu `&Xi_off($i)`,$Xi

for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }

        lea     `$REG_SZ*16`(%rsp),%rbx
        lea     K256+128(%rip),$Tbl

for($i=0;$i<8;$i++) {
        cmp     `4*$i`(%rbx),%ecx               # examine counters
        cmovge  $Tbl,@ptr[$i]                   # cancel input

        vmovdqa (%rbx),$sigma                   # pull counters

        vpcmpgtd        $t1,$Xn,$Xn             # mask value
        vpaddd  $Xn,$sigma,$sigma               # counters--

        vmovdqu 0x00-0x80($ctx),$t1
        vmovdqu 0x20-0x80($ctx),$t2
        vmovdqu 0x40-0x80($ctx),$t3
        vmovdqu 0x60-0x80($ctx),$Xi

        vmovdqu 0x80-0x80($ctx),$t1
        vmovdqu 0xa0-0x80($ctx),$t2
        vmovdqu 0xc0-0x80($ctx),$t3
        vmovdqu 0xe0-0x80($ctx),$Xi

        vmovdqu $A,0x00-0x80($ctx)
        vmovdqu $B,0x20-0x80($ctx)
        vmovdqu $C,0x40-0x80($ctx)
        vmovdqu $D,0x60-0x80($ctx)
        vmovdqu $E,0x80-0x80($ctx)
        vmovdqu $F,0xa0-0x80($ctx)
        vmovdqu $G,0xc0-0x80($ctx)
        vmovdqu $H,0xe0-0x80($ctx)

        vmovdqu $sigma,(%rbx)                   # save counters
        lea     256+128(%rsp),%rbx
        vmovdqu .Lpbswap(%rip),$Xn

        #mov    `$REG_SZ*17+8`(%rsp),$num
        #lea    $REG_SZ($ctx),$ctx
        #lea    `16*$REG_SZ/4`($inp),$inp
        #jnz    .Loop_grande_avx2
        mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
$code.=<<___ if ($win64);
        movaps  -0xd8(%rax),%xmm6
        movaps  -0xc8(%rax),%xmm7
        movaps  -0xb8(%rax),%xmm8
        movaps  -0xa8(%rax),%xmm9
        movaps  -0x98(%rax),%xmm10
        movaps  -0x88(%rax),%xmm11
        movaps  -0x78(%rax),%xmm12
        movaps  -0x68(%rax),%xmm13
        movaps  -0x58(%rax),%xmm14
        movaps  -0x48(%rax),%xmm15

.size   sha256_multi_block_avx2,.-sha256_multi_block_avx2

&TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
        0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
        0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
        0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
        0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
        0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
        0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
        0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
        0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
        0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
        0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
        0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
        0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
        0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
        0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
        0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );

        .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap
        .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap
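        # vpshufb/pshufb mask that reverses the bytes of each 32-bit
        # word, converting little-endian loads to SHA-256 byte order;
        # stated twice so the AVX2 path can load a full 32-byte mask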
foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval($1)/ge;

        s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go         or
        s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go        or
        s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go    or
        s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go       or
        s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go           or
        s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
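        # e.g. "vmovd (%r8),%ymm6" is rewritten as "vmovd (%r8),%xmm6";
        # the shared AVX round code is emitted with %ymm operands
        # throughout, and these rules narrow the few instructions that
        # must use their %xmm forms to be encodable.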