# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as
# spelled out in http://download.intel.com/design/intarch/papers/323686.pdf,
# is that since AESNI-CBC encrypt exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm makes better use
# of processor resources and achieves higher performance. SHA256
# instruction sequences(*) are taken from sha512-x86_64.pl and the
# AESNI code is woven into them. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte, less is better,
# for standalone AESNI-CBC encrypt, standalone SHA256, and the
# stitched subroutine:
#
#		 AES-128/-192/-256+SHA256	this(**)	gain
# Sandy Bridge	 5.05/6.05/7.05+11.6		13.0		+28%/36%/43%
# Ivy Bridge	 5.05/6.05/7.05+10.3		11.6		+32%/41%/50%
# Haswell	 4.43/5.29/6.19+7.80		8.79		+39%/49%/59%
# Skylake	 2.62/3.14/3.62+7.70		8.10		+27%/34%/40%
# Bulldozer	 5.77/6.89/8.00+13.7		13.7		+42%/50%/58%
#
# (*)	there are XOP, AVX1 and AVX2 code paths; Westmere is not
#	covered because the estimated gain was not high enough to
#	justify the effort;
# (**)	these are EVP-free results; results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}
$shaext=$avx;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);
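# Note on dispatch: $avx==0 selects no AVX path, $avx==1 enables the
# XOP/AVX1 subroutines, $avx>1 additionally enables the AVX2 one, and
# $shaext gates the SHA extension (SHA-NI) code path; the actual choice
# is made at run time from OPENSSL_ia32cap_P below.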
open OUT,"| \"$^X\" $xlate $flavour $output";

$func="aesni_cbc_sha256_enc";

@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA256_CTX *ctx,
#			const void *in0);
($inp, $out, $len, $key, $ivp, $ctx, $in0) =
    ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="16*$SZ+7*8(%rsp)";
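# Stack frame layout, as implied by the slots above: the bottom 16*$SZ
# bytes hold the X[]+K[] message schedule for the block being hashed,
# followed by eight 8-byte spill slots for the function arguments and
# the caller's %rsp (with the Win64 xmm6-xmm15 save area above that).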
.extern	OPENSSL_ia32cap_P
.type	$func,\@abi-omnipotent
	lea	OPENSSL_ia32cap_P(%rip),%r11
	cmp	\$0,`$win64?"%rcx":"%rdi"`
$code.=<<___ if ($shaext);
	bt	\$61,%r10			# check for SHA
	test	\$`1<<11`,%r10d			# check for XOP
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	and	\$`1<<28`,%r10d			# check for AVX
	cmp	\$0,`$win64?"%rcx":"%rdi"`
.type	$TABLE,\@object
.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
.long	0,0,0,0,   0,0,0,0
.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"

######################################################################
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
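# Note: $mask10/$mask12/$mask14 implement the key-length-agnostic AES
# trick used throughout: the round sequence below is always unrolled
# for the 14-round (AES-256) schedule, vaesenclast is executed
# speculatively after rounds 10, 12 and 14, and the three masks
# (exactly one of which is all-ones, selected by the round count when
# they are loaded) pick the one valid result, so a single code path
# serves AES-128/-192/-256.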
##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
##	&vmovdqu	($inout,($inp));
##	&mov		($_inp,$inp);

	'&vpxor		($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',
	'&vpxor		($inout,$inout,$iv);',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',
	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',
	'&vpand		($iv,$temp,$mask10);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',
	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',
	'&vpand		($temp,$temp,$mask12);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xe0-0x80($inp)");',
	'&vpor		($iv,$iv,$temp);'.
	' &vaesenclast	($temp,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'

##	&mov		($inp,$_inp);
##	&mov		($out,$_out);
##	&vpand		($temp,$temp,$mask14);
##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	("($out,$inp)",$iv);	# write output
##	&lea		($inp,"16($inp)");
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g
	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&and	($a4,$e)',			# (f^g)&e
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&add	($d,$h)',			# d+=h
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($h,$a3)',			# h+=Maj(a,b,c)
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
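# For reference, the round body above computes the standard SHA-256
# step:
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	d += T1;  h = T1 + Sigma0(a) + Maj(a,b,c)
# with Maj(a,b,c) evaluated as ((b^c)&(a^b))^b and the Sigma0(a)
# addition carried over into the following round.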
######################################################################
.type	${func}_xop,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp			# align stack frame
	sub	$inp,$out			# re-bias
	add	$inp,$len			# end of input
	#mov	$inp,$_inp			# saved later
	#mov	$key,$_key			# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r12			# borrow $a4
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV
	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl		# size optimization
	vmovdqu	(%r12),$inout			# $a4
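# The .Lxop_00_47 loop below interleaves four SHA-256 rounds with the
# message-schedule update for four future words at a time:
#	X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
# (the vpalignr/vprotd/vpsrld/vpxor sequence), plus one AES-CBC round
# injected per body_00_15 invocation.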
sub XOP_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	&vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	&vpsrld		($t0,$t0,$sigma0[2]);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
	&vpxor		($t0,$t0,$t1);
	&vprotd		($t3,@X[3],8*$SZ-$sigma1[1]);
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	&vpsrld		($t2,@X[3],$sigma1[2]);
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	&vpsrldq	($t3,$t3,8);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	&vprotd		($t3,@X[0],8*$SZ-$sigma1[1]);
	&vpsrld		($t2,@X[0],$sigma1[2]);
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	&vpslldq	($t3,$t3,8);		# 22 instructions
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov	("%r12",$_inp);			# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);			# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu ("(%r15,%r12)",$iv);		# write output
	&lea	("%r12","16(%r12)");		# inp++
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lxop_00_47");
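# Note: the loop terminator is data-driven rather than counted. The
# cmpb above probes one table row ahead: the examined byte is non-zero
# in every K256 row but zero in the byte-swap mask row appended after
# the constants, so the loop falls through after the last 16 rounds.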
	&vmovdqu	($inout,"(%r12)");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
	mov	$_inp,%r12			# borrow $a4
	mov	$_out,%r13			# borrow $a0
	mov	$_ctx,%r15			# borrow $a2
	mov	$_in0,%rsi			# borrow $a3
	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)			# write output
	lea	16(%r12),%r12			# inp++
	vmovdqu	$iv,($ivp)			# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.size	${func}_xop,.-${func}_xop

######################################################################
local *ror = sub { &shrd(@_[0],@_) };

.type	${func}_avx,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp			# align stack frame
	sub	$inp,$out			# re-bias
	add	$inp,$len			# end of input
	#mov	$inp,$_inp			# saved later
	#mov	$key,$_key			# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r12			# borrow $a4
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV
	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl		# size optimization
	vmovdqu	(%r12),$inout			# $a4
sub Xupdate_256_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',		# sigma1(X[14..15])
	'&vpshufd	($t2,$t2,0b10000100)',
	'&vpsrldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',
	'&vpshufd	($t2,$t2,0b11101000)',
	'&vpslldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
	);
}
sub AVX_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov	("%r12",$_inp);			# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);			# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu ("(%r15,%r12)",$iv);		# write output
	&lea	("%r12","16(%r12)");		# inp++
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");
	&vmovdqu	($inout,"(%r12)");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
	mov	$_inp,%r12			# borrow $a4
	mov	$_out,%r13			# borrow $a0
	mov	$_ctx,%r15			# borrow $a2
	mov	$_in0,%rsi			# borrow $a3
	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)			# write output
	lea	16(%r12),%r12			# inp++
	vmovdqu	$iv,($ivp)			# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.size	${func}_avx,.-${func}_avx

######################################################################
my $a5=$SZ==4?"%esi":"%rsi";			# zap $inp
# On entry $a1 must be zero, $a3 must hold $b^$c, and $a4 a copy of $f.
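# The AVX2 round body relies on BMI1/BMI2: rorx rotates without
# touching flags or its source, andn computes ~e&g directly, and lea
# performs the additions, which is why this path is guarded by the
# three-way BMI2+AVX2+BMI1 capability check above.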
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',			# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',
	'&lea	($a,"($a,$a1)")',		# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',			# ~e&g
	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',		# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',			# Sigma1(e)
	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',		# h+=Sigma1(e)
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',
	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',		# d+=h
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',			# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.		# h+=Maj(a,b,c)
	'&mov	($a4,$e)',			# copy of f in future
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
# and at the finish one has to $a+=$a1

.type	${func}_avx2,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
	and	\$-256*$SZ,%rsp			# align stack frame
	add	\$`2*$SZ*($rounds-8)`,%rsp
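# Note on the frame: the AVX2 path walks its X[]+K[] schedule with
# rolling lea -$PUSH8(%rsp) adjustments, so %rsp is first aligned down
# to 256*$SZ and then biased up by 2*$SZ*($rounds-8) to leave room for
# that backward-growing window.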
	sub	$inp,$out			# re-bias
	add	$inp,$len			# end of input
	#mov	$inp,$_inp			# saved later
	#mov	$out,$_out			# kept in $offload
	#mov	$key,$_key			# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r13			# borrow $a0
	vpinsrq	\$1,$out,$offload,$offload
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV
	vmovdqa	0x00(%r12,%r14,8),$mask14
	vmovdqa	0x10(%r12,%r14,8),$mask12
	vmovdqa	0x20(%r12,%r14,8),$mask10
	sub	\$-16*$SZ,%r13			# inp++, size optimization
	lea	(%rsi,%r13),%r12		# borrow $a0
	cmp	$len,%r13			# $_end
	cmove	%rsp,%r12			# next block or random data
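# The ymm registers carry the schedule for two 64-byte blocks side by
# side; vinserti128 below fills the high lane from %r12. When no next
# block exists, %r12 was pointed at the stack (cmove above) so the
# speculative loads cannot fault, and the rounds computed over that
# junk lane are discarded once the $_end comparison fires.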
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	lea	-16*$SZ(%r13),%r13
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vmovdqa	$t2,0x00(%rsp)
	vmovdqa	$t3,0x20(%rsp)
	sub	\$-16*2*$SZ,$Tbl		# size optimization
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload

sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&vmovq	("%r13",$offload);		# borrow $a0
	&vpextrq ("%r15",$offload,1);		# borrow $a2
	&vpand	($temp,$temp,$mask14);
	&vpor	($iv,$iv,$temp);
	&vmovdqu ("(%r15,%r13)",$iv);		# write output
	&lea	("%r13","16(%r13)");		# inp++
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");
	&vmovdqu	($inout,"(%r13)");
	&vpinsrq	($offload,$offload,"%r13",0);

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
	vmovq	$offload,%r13			# $_inp, borrow $a0
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r12,%r13)			# write output
	cmp	`$PUSH8+2*8`($Tbl),%r13		# $_end
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload

    for ($i=0; $i<16; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
    }
	vmovq	$offload,%r13			# borrow $a0
	vpextrq	\$1,$offload,%r15		# borrow $a2
	vpand	$mask14,$temp,$temp
	lea	-$PUSH8($Tbl),$Tbl
	vmovdqu	$iv,(%r15,%r13)			# write output
	lea	16(%r13),%r13			# inp++
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	16*$SZ(%r13),%r13
	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
	lea	(%rsi,%r13),%r12
	cmove	%rsp,%r12			# next block or stale data
	vmovdqu	$iv,($ivp)			# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.size	${func}_avx2,.-${func}_avx2

my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my ($rounds,$Tbl)=("%r11d","%rbx");

my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));

my ($n,$k)=($r/10,$r%10);
	movups	`16*$n`($in0),$in		# load input
$code.=<<___ if ($n);
	movups	$iv,`16*($n-1)`($out,$in0)	# write output
	movups	`32+16*$k-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc	$rndkey[1],$iv
	movups	`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc	$rndkey[1],$iv
	aesenclast	$rndkey[0],$iv
	movups	16-112($key),$rndkey[1]		# forward reference
	movups	`32+16*$k-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
    $r++;	unshift(@rndkey,pop(@rndkey));
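# The $aesenc closure above emits exactly one AES round (or the
# key-length dependent aesenclast tail) per call: $n=$r/10 indexes the
# 16-byte CBC block, $k=$r%10 the round within it. The SHA code below
# invokes it at a fixed cadence, so forty calls cover the four CBC
# blocks per 64 bytes hashed (see the "while ($r<40)" drain further
# down).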
.type	${func}_shaext,\@function,6
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
	lea	K256+0x80(%rip),$Tbl
	movdqu	($ctx),$ABEF		# DCBA
	movdqu	16($ctx),$CDGH		# HGFE
	movdqa	0x200-0x80($Tbl),$TMP	# byte swap mask
	mov	240($key),$rounds
	movups	($key),$rndkey0		# $key[0]
	movups	16($key),$rndkey[0]	# forward reference
	lea	112($key),$key		# size optimization
	pshufd	\$0x1b,$ABEF,$Wi	# ABCD
	pshufd	\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd	\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa	$TMP,$BSWAP		# offload
	palignr	\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH	# CDGH
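# sha256rnds2 expects the state split across two registers in
# ABEF/CDGH order rather than the canonical A..H layout of SHA256_CTX,
# hence the pshufd/palignr/punpcklqdq shuffle above (and its inverse
# before the state is stored back at the end).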
	movdqu	($inp),@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqu	0x20($inp),@MSG[2]
	movdqu	0x30($inp),@MSG[3]

	movdqa	0*32-0x80($Tbl),$Wi
	movdqa	$CDGH,$CDGH_SAVE	# offload
	movdqa	$ABEF,$ABEF_SAVE	# offload
	sha256rnds2	$ABEF,$CDGH	# 0-3
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa	1*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 4-7
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa	2*32-0x80($Tbl),$Wi
	sha256msg1	@MSG[1],@MSG[0]
	sha256rnds2	$ABEF,$CDGH	# 8-11
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[2],$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa	3*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 12-15
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[3],$TMP
	sha256rnds2	$CDGH,$ABEF

for($i=4;$i<16-3;$i++) {
&$aesenc()			if (($r%10)==0);
	movdqa	$i*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 16-19...
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[0],$TMP
&$aesenc()			if ($r==19);
	sha256rnds2	$CDGH,$ABEF
push(@MSG,shift(@MSG));
}
	movdqa	13*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 52-55
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[0],$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa	14*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[1],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 56-59
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa	15*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 60-63
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	#pxor	$CDGH,$rndkey0		# black magic
while ($r<40)	{ &$aesenc(); }		# remaining aesenc's
	#xorps	$CDGH,$rndkey0		# black magic
	paddd	$CDGH_SAVE,$CDGH
	paddd	$ABEF_SAVE,$ABEF
	movups	$iv,48($out,$in0)	# write output
	pshufd	\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd	\$0x1b,$ABEF,$TMP	# FEBA
	pshufd	\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF	# DCBA
	palignr	\$8,$TMP,$CDGH		# HGFE
	movups	$iv,($ivp)		# write IV
	movdqu	$CDGH,16($ctx)
$code.=<<___ if ($win64);
	movaps	0*16(%rsp),%xmm6
	movaps	1*16(%rsp),%xmm7
	movaps	2*16(%rsp),%xmm8
	movaps	3*16(%rsp),%xmm9
	movaps	4*16(%rsp),%xmm10
	movaps	5*16(%rsp),%xmm11
	movaps	6*16(%rsp),%xmm12
	movaps	7*16(%rsp),%xmm13
	movaps	8*16(%rsp),%xmm14
	movaps	9*16(%rsp),%xmm15
	lea	8+10*16(%rsp),%rsp
.size	${func}_shaext,.-${func}_shaext
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64 && $avx) {
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip
	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	mov	152($context),%rax	# pull context->Rsp
	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	add	\$`2*$SZ*($rounds-8)`,%rax
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi
	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)
	mov	\$1,%eax		# ExceptionContinueSearch
.size	se_handler,.-se_handler
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
.LSEH_info_${func}_xop:
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
.LSEH_info_${func}_avx:
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}
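# sha256op38() exists because older assemblers predate the SHA-NI
# mnemonics: instead of emitting "sha256rnds2 %xmm0,%xmm2" directly it
# encodes the instruction by hand as raw 0x0f,0x38,<opcode>,<ModR/M>
# bytes (plus a REX prefix from rex() when either operand is
# %xmm8..15), and falls back to the plain mnemonic for anything it
# does not recognize.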
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;