# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This is the AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encrypt exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows better
# utilization of processor resources and hence better performance.
# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
# the AESNI code is woven in. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte, less is better,
# for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
# subroutine:
#
#		   AES-128/-192/-256+SHA256	this(**)	gain
# Sandy Bridge	   5.05/6.05/7.05+11.6		13.0		+28%/36%/43%
# Ivy Bridge	   5.05/6.05/7.05+10.3		11.6		+32%/41%/50%
# Haswell	   4.43/5.29/6.19+7.80		8.79		+39%/49%/59%
# Skylake	   2.62/3.14/3.62+7.70		8.10		+27%/34%/40%
# Bulldozer	   5.77/6.89/8.00+13.7		13.7		+42%/50%/58%
#
# (*)	there are XOP, AVX1 and AVX2 code paths; Westmere is omitted
#	from the loop because the estimated gain was not high enough
#	to justify the effort;
# (**)	these are EVP-free results; results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
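#
# How the stitching works (a simplified reading of the code below, for
# orientation): @aesni_cbc_block chops the CBC encryption of one
# 16-byte block into 16 instruction fragments, and every SHA256 round
# body pulls in one fragment via @aesni_cbc_block[$aesni_cbc_idx++],
# conceptually
#
##	for my $round (0..15) {
##		foreach (body_00_15()) { eval; }	# one SHA256 round,
##	}						# one AES fragment each
#
# so that each 64-byte SHA256 block (64 rounds, with the index reset
# every 16 rounds) carries four AES-CBC blocks' worth of encryption.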
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=$avx;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);
open OUT,"| \"$^X\" $xlate $flavour $output";

$func="aesni_cbc_sha256_enc";

@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char iv[16],
#			SHA256_CTX *ctx,
#			const void *in0);
($inp, $out, $len, $key, $ivp, $ctx, $in0) =
    ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="16*$SZ+7*8(%rsp)";
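
# A sketch of the resulting frame layout for SHA256 ($SZ==4), relative
# to the 64-byte-aligned %rsp (derived from the offsets above, for
# orientation only):
#
#	  0(%rsp)	X[0..15] message schedule scratch, 16*$SZ bytes
#	 64(%rsp)	$_inp, $_out, $_end, $_key
#	 96(%rsp)	$_ivp, $_ctx, $_in0, $_rsp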
.extern	OPENSSL_ia32cap_P
.type	$func,\@abi-omnipotent
	lea	OPENSSL_ia32cap_P(%rip),%r11
	cmp	\$0,`$win64?"%rcx":"%rdi"`
$code.=<<___ if ($shaext);
	bt	\$61,%r10			# check for SHA
	test	\$`1<<11`,%r10d			# check for XOP
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	and	\$`1<<28`,%r10d			# check for AVX
	cmp	\$0,`$win64?"%rcx":"%rdi"`
.type	$TABLE,\@object
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
	.long	0,0,0,0,   0,0,0,0
.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
######################################################################
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
##	&vmovdqu	($inout,($inp));
##	&mov		($_inp,$inp);
	'&vpxor		($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',
	'&vpxor		($inout,$inout,$iv);',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',
	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',
	'&vpand		($iv,$temp,$mask10);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',
	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',
	'&vpand		($temp,$temp,$mask12);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	'&vmovdqu	($roundkey,"0xe0-0x80($inp)");',
	'&vpor		($iv,$iv,$temp);'.
	' &vaesenclast	($temp,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'
##	&mov		($inp,$_inp);
##	&mov		($out,$_out);
##	&vpand		($temp,$temp,$mask14);
##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	("($out,$inp)",$iv);
##	&lea		($inp,"16($inp)");
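
# (A note on the masks, from a reading of the fragment list above: the
# AES round count is only known at run time, so vaesenclast is executed
# speculatively after rounds 10, 12 and 14; $mask10/$mask12/$mask14,
# exactly one of which is all-ones and the rest zero, then select the
# result that matches the actual key length when $iv is assembled.)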
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
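
# For illustration: with $a0 being "%r13d" and the usual SHA256
# rotation constants @Sigma1=(6,11,25) (defined outside this excerpt),
# a call such as &ror($a0,$Sigma1[0]) appends
#
##	ror	$6,%r13d
#
# to $code, i.e. bare numbers become immediates and the operand order
# is flipped to AT&T convention.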
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g
	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&and	($a4,$e)',			# (f^g)&e
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&add	($d,$h)',			# d+=h
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($h,$a3)',			# h+=Maj(a,b,c)
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
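
# A small illustration of the register rotation in the last fragment
# above: unshift(@ROT,pop(@ROT)) maps (a,b,c,d,e,f,g,h) to
# (h,a,b,c,d,e,f,g), e.g.
#
##	@ROT = qw(a b c d e f g h);
##	unshift(@ROT, pop(@ROT));	# @ROT is now (h a b c d e f g)
#
# so every round reuses the same body with renamed registers instead of
# moving data between them.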
######################################################################
.type	${func}_xop,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp			# align stack frame
	sub	$inp,$out			# re-bias
	add	$inp,$len			# end of input
	#mov	$inp,$_inp			# saved later
	#mov	$key,$_key			# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r12			# borrow $a4
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV
	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl		# size optimization
	vmovdqu	(%r12),$inout			# $a4
sub XOP_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	&vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	&vpsrld		($t0,$t0,$sigma0[2]);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
	&vpxor		($t0,$t0,$t1);
	&vprotd		($t3,@X[3],8*$SZ-$sigma1[1]);
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	&vpsrld		($t2,@X[3],$sigma1[2]);
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	&vpsrldq	($t3,$t3,8);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	&vprotd		($t3,@X[0],8*$SZ-$sigma1[1]);
	&vpsrld		($t2,@X[0],$sigma1[2]);
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	&vpslldq	($t3,$t3,8);		# 22 instructions
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
    for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov	("%r12",$_inp);			# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);			# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea	("%r12","16(%r12)");		# inp++
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lxop_00_47");
	&vmovdqu	($inout,"(%r12)");
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
	mov	$_inp,%r12			# borrow $a4
	mov	$_out,%r13			# borrow $a0
	mov	$_ctx,%r15			# borrow $a2
	mov	$_in0,%rsi			# borrow $a3

	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)			# write output
	lea	16(%r12),%r12			# inp++

	vmovdqu	$iv,($ivp)			# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.size	${func}_xop,.-${func}_xop
######################################################################
local *ror = sub { &shrd(@_[0],@_) };

.type	${func}_avx,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp			# align stack frame
	sub	$inp,$out			# re-bias
	add	$inp,$len			# end of input
	#mov	$inp,$_inp			# saved later
	#mov	$key,$_key			# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r12			# borrow $a4
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV
	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl		# size optimization
	vmovdqu	(%r12),$inout			# $a4
sub Xupdate_256_AVX () {
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',		# sigma1(X[14..15])
	'&vpshufd	($t2,$t2,0b10000100)',
	'&vpsrldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',
	'&vpshufd	($t2,$t2,0b11101000)',
	'&vpslldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
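
# (For orientation: one pass over the list above produces four new
# message-schedule words in @X[0]; sigma0(X[1..4]) is computed on a
# full four-word vector, while sigma1 is computed twice on two-word
# halves, because its inputs X[14..15] and then X[16..17] partly
# overlap the very vector being constructed.)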
sub AVX_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov	("%r12",$_inp);			# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);			# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea	("%r12","16(%r12)");		# inp++
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");
	&vmovdqu	($inout,"(%r12)");
    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
	mov	$_inp,%r12			# borrow $a4
	mov	$_out,%r13			# borrow $a0
	mov	$_ctx,%r15			# borrow $a2
	mov	$_in0,%rsi			# borrow $a3

	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)			# write output
	lea	16(%r12),%r12			# inp++

	vmovdqu	$iv,($ivp)			# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.size	${func}_avx,.-${func}_avx
######################################################################
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp

# at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',			# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',
	'&lea	($a,"($a,$a1)")',		# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',			# ~e&g
	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',		# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',			# Sigma1(e)
	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',		# h+=Sigma1(e)
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',
	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',		# d+=h
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',			# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.		# h+=Maj(a,b,c)
	'&mov	($a4,$e)',			# copy of f in future
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
# and at the finish one has to do $a+=$a1
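# (this is the deferral visible above as '&lea ($a,"($a,$a1)")',
# "h+=Sigma0(a) from the past": Sigma0(a) of round i is only folded in
# during round i+1, which shortens the critical path but leaves the
# last round's $a1 to be added after the loop)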
.type	${func}_avx2,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
	and	\$-256*$SZ,%rsp			# align stack frame
	add	\$`2*$SZ*($rounds-8)`,%rsp
	sub	$inp,$out			# re-bias
	add	$inp,$len			# end of input
	#mov	$inp,$_inp			# saved later
	#mov	$out,$_out			# kept in $offload
	#mov	$key,$_key			# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r13			# borrow $a0
	vpinsrq	\$1,$out,$offload,$offload
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV
	vmovdqa	0x00(%r12,%r14,8),$mask14
	vmovdqa	0x10(%r12,%r14,8),$mask12
	vmovdqa	0x20(%r12,%r14,8),$mask10
	sub	\$-16*$SZ,%r13			# inp++, size optimization
	lea	(%rsi,%r13),%r12		# borrow $a0
	cmp	$len,%r13			# $_end
	cmove	%rsp,%r12			# next block or random data
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	lea	-16*$SZ(%r13),%r13
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vmovdqa	$t2,0x00(%rsp)
	vmovdqa	$t3,0x20(%rsp)
	sub	\$-16*2*$SZ,$Tbl		# size optimization
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
sub AVX2_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&vmovq	("%r13",$offload);		# borrow $a0
	&vpextrq	("%r15",$offload,1);	# borrow $a2
	&vpand	($temp,$temp,$mask14);
	&vpor	($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r13)",$iv);	# write output
	&lea	("%r13","16(%r13)");		# inp++
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");
	&vmovdqu	($inout,"(%r13)");
	&vpinsrq	($offload,$offload,"%r13",0);
    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
	vmovq	$offload,%r13			# $_inp, borrow $a0
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r12,%r13)			# write output

	cmp	`$PUSH8+2*8`($Tbl),%r13		# $_end
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
    for ($i=0; $i<16; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
    }
	vmovq	$offload,%r13			# borrow $a0
	vpextrq	\$1,$offload,%r15		# borrow $a2
	vpand	$mask14,$temp,$temp
	lea	-$PUSH8($Tbl),$Tbl
	vmovdqu	$iv,(%r15,%r13)			# write output
	lea	16(%r13),%r13			# inp++

	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	16*$SZ(%r13),%r13
	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
	lea	(%rsi,%r13),%r12
	cmove	%rsp,%r12			# next block or stale data

	vmovdqu	$iv,($ivp)			# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.size	${func}_avx2,.-${func}_avx2
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my ($rounds,$Tbl)=("%r11d","%rbx");

my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));

	my ($n,$k)=($r/10,$r%10);
	movups	`16*$n`($in0),$in		# load input
$code.=<<___ if ($n);
	movups	$iv,`16*($n-1)`($out,$in0)	# write output
	movups	`32+16*$k-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc	$rndkey[1],$iv
	movups	`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc	$rndkey[1],$iv
	aesenclast	$rndkey[0],$iv
	movups	16-112($key),$rndkey[1]		# forward reference
	movups	`32+16*$k-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	$r++;	unshift(@rndkey,pop(@rndkey));
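
# (For orientation: $r counts emitted aesenc fragments; $n=$r/10 picks
# the 16-byte block being CBC-encrypted and $k=$r%10 the round within
# it, so ten fragments carry one AES block and one 64-byte SHA256 block
# drains $r to 40, i.e. four AES blocks; see the 'while ($r<40)' drain
# further below.)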
.type	${func}_shaext,\@function,6
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
	lea	K256+0x80(%rip),$Tbl
	movdqu	($ctx),$ABEF			# DCBA
	movdqu	16($ctx),$CDGH			# HGFE
	movdqa	0x200-0x80($Tbl),$TMP		# byte swap mask
	mov	240($key),$rounds
	movups	($key),$rndkey0			# $key[0]
	movups	16($key),$rndkey[0]		# forward reference
	lea	112($key),$key			# size optimization
	pshufd	\$0x1b,$ABEF,$Wi		# ABCD
	pshufd	\$0xb1,$ABEF,$ABEF		# CDAB
	pshufd	\$0x1b,$CDGH,$CDGH		# EFGH
	movdqa	$TMP,$BSWAP			# offload
	palignr	\$8,$CDGH,$ABEF			# ABEF
	punpcklqdq	$Wi,$CDGH		# CDGH
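	# (The four shuffles above re-pack the state from the ABCD/EFGH
	# word order of SHA256_CTX into the ABEF/CDGH register layout
	# that sha256rnds2 expects; the mirror-image sequence after the
	# last round converts it back before the state is stored.)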
	movdqu	($inp),@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqu	0x20($inp),@MSG[2]
	movdqu	0x30($inp),@MSG[3]
	movdqa	0*32-0x80($Tbl),$Wi
	movdqa	$CDGH,$CDGH_SAVE		# offload
	movdqa	$ABEF,$ABEF_SAVE		# offload
	sha256rnds2	$ABEF,$CDGH		# 0-3
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	movdqa	1*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH		# 4-7
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	movdqa	2*32-0x80($Tbl),$Wi
	sha256msg1	@MSG[1],@MSG[0]
	sha256rnds2	$ABEF,$CDGH		# 8-11
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[2],$TMP
	sha256rnds2	$CDGH,$ABEF
	movdqa	3*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
	sha256rnds2	$ABEF,$CDGH		# 12-15
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[3],$TMP
	sha256rnds2	$CDGH,$ABEF
for($i=4;$i<16-3;$i++) {
	&$aesenc()	if (($r%10)==0);
	movdqa	$i*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$ABEF,$CDGH		# 16-19...
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[0],$TMP
	&$aesenc()	if ($r==19);
	sha256rnds2	$CDGH,$ABEF
	push(@MSG,shift(@MSG));
}
	movdqa	13*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$ABEF,$CDGH		# 52-55
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[0],$TMP
	sha256rnds2	$CDGH,$ABEF
	movdqa	14*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[1],@MSG[2]
	sha256rnds2	$ABEF,$CDGH		# 56-59
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	movdqa	15*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH		# 60-63
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	#pxor	$CDGH,$rndkey0			# black magic
while ($r<40)	{ &$aesenc(); }			# remaining aesenc's
	#xorps	$CDGH,$rndkey0			# black magic
	paddd	$CDGH_SAVE,$CDGH
	paddd	$ABEF_SAVE,$ABEF
	movups	$iv,48($out,$in0)		# write output
	pshufd	\$0xb1,$CDGH,$CDGH		# DCHG
	pshufd	\$0x1b,$ABEF,$TMP		# FEBA
	pshufd	\$0xb1,$ABEF,$ABEF		# BAFE
	punpckhqdq	$CDGH,$ABEF		# DCBA
	palignr	\$8,$TMP,$CDGH			# HGFE
	movups	$iv,($ivp)			# write IV
	movdqu	$CDGH,16($ctx)
$code.=<<___ if ($win64);
	movaps	0*16(%rsp),%xmm6
	movaps	1*16(%rsp),%xmm7
	movaps	2*16(%rsp),%xmm8
	movaps	3*16(%rsp),%xmm9
	movaps	4*16(%rsp),%xmm10
	movaps	5*16(%rsp),%xmm11
	movaps	6*16(%rsp),%xmm12
	movaps	7*16(%rsp),%xmm13
	movaps	8*16(%rsp),%xmm14
	movaps	9*16(%rsp),%xmm15
	lea	8+10*16(%rsp),%rsp
.size	${func}_shaext,.-${func}_shaext
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64 && $avx) {
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	add	\$`2*$SZ*($rounds-8)`,%rax
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6 save area
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq

	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
.size	se_handler,.-se_handler
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop

	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
.LSEH_info_${func}_xop:
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
.LSEH_info_${func}_avx:
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}

my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd );

sub sha256op38 {
    my $instr = shift;

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}
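
# For illustration: sha256op38("sha256rnds2","%xmm0,%xmm2") returns
# ".byte\t15,56,203,208" (0f 38 cb plus ModR/M 0xd0: mod=11, reg=2,
# rm=0), so the module assembles even with assemblers that predate the
# SHA extension mnemonics.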
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;