2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # This is the AESNI-CBC+SHA256 "stitch" implementation. The idea, as
20 # spelled out in http://download.intel.com/design/intarch/papers/323686.pdf, is
21 # that since AESNI-CBC encrypt exhibits *very* low instruction-level
22 # parallelism, interleaving it with another algorithm would allow to
23 # utilize processor resources better and achieve better performance.
24 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
25 # AESNI code is weaved into it. As SHA256 dominates execution time,
26 # stitch performance does not depend on AES key length. Below are
27 # performance numbers in cycles per processed byte, less is better,
28 # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
31 # AES-128/-192/-256+SHA256 this(**)gain
32 # Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43%
33 # Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50%
34 # Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59%
35 # Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40%
36 # Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58%
38 # (*) there are XOP, AVX1 and AVX2 code paths, meaning that
39 # Westmere is omitted from the loop; this is because the estimated
40 # gain was not high enough to justify the effort;
41 # (**) these are EVP-free results, results obtained with 'speed
42 # -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
# If the sole command-line argument contains a dot, it is the output file
# name rather than a flavour specifier.
46 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 ABI is selected by the [nm]asm/mingw64 flavours or an .asm output name.
48 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the x86_64-xlate.pl perlasm translator next to this script, falling
# back to the ../../perlasm/ layout of the OpenSSL source tree.
50 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
51 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
52 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
53 die "can't locate x86_64-xlate.pl";
55 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
56 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
57 $avx = ($1>=2.19) + ($1>=2.22);
60 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
61 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
62 $avx = ($1>=2.09) + ($1>=2.10);
65 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
66 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
67 $avx = ($1>=10) + ($1>=12);
70 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
71 $avx = ($2>=3.0) + ($2>3.0);
# Emit the SHA-NI (shaext) code path whenever any AVX level was detected;
# set $shaext to zero by hand when building for OpenSSL 1.0.1.
74 $shaext=$avx; ### set to zero if compiling for 1.0.1
75 $avx=1 if (!$shaext && $avx);
# Pipe all generated code through the perlasm translator into $output.
77 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
# Name of the exported entry point.
80 $func="aesni_cbc_sha256_enc";
# SHA-256 working variables a..h live in these eight 32-bit registers;
# @ROT is rotated by one position after every round.
83 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
84 "%r8d","%r9d","%r10d","%r11d");
# Scratch registers used by the round body.
85 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
92 ########################################################################
93 # void aesni_cbc_sha256_enc(const void *inp,
# Argument registers (first six per the SysV AMD64 convention; the seventh
# argument, $in0, is loaded from the stack in each function prologue).
100 ($inp, $out, $len, $key, $ivp, $ctx, $in0) =
101 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
# Stack-frame slots: the low 16*$SZ bytes of the frame hold the message
# schedule X[]; the eight 8-byte slots above it spill the arguments and
# the original %rsp so they survive register reuse in the round loops.
105 $_inp="16*$SZ+0*8(%rsp)";
106 $_out="16*$SZ+1*8(%rsp)";
107 $_end="16*$SZ+2*8(%rsp)";
108 $_key="16*$SZ+3*8(%rsp)";
109 $_ivp="16*$SZ+4*8(%rsp)";
110 $_ctx="16*$SZ+5*8(%rsp)";
111 $_in0="16*$SZ+6*8(%rsp)";
112 $_rsp="`16*$SZ+7*8`(%rsp)";
118 .extern OPENSSL_ia32cap_P
120 .type $func,\@abi-omnipotent
126 lea OPENSSL_ia32cap_P(%rip),%r11
128 cmp \$0,`$win64?"%rcx":"%rdi"`
133 $code.=<<___ if ($shaext);
134 bt \$61,%r10 # check for SHA
141 test \$`1<<11`,%r10d # check for XOP
144 $code.=<<___ if ($avx>1);
145 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
146 cmp \$`1<<8|1<<5|1<<3`,%r11d
150 and \$`1<<28`,%r10d # check for AVX
157 cmp \$0,`$win64?"%rcx":"%rdi"`
165 .type $TABLE,\@object
167 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
168 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
169 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
170 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
171 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
172 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
173 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
174 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
175 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
176 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
177 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
178 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
179 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
180 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
181 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
182 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
183 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
184 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
185 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
186 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
187 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
188 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
189 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
190 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
191 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
192 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
193 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
194 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
195 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
196 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
197 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
198 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
200 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
201 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
202 .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1
203 .long 0,0,0,0, 0,0,0,0
204 .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
208 ######################################################################
# %xmm8..%xmm15 for the interleaved AES-CBC leg: CBC chaining value ($iv),
# AES state ($inout), current round key, scratch ($temp), the three
# selection masks indexed by key schedule length (presumably 10/12/14
# rounds — confirm against the mask table loads), and an offload register.
212 ($iv,$inout,$roundkey,$temp,
213 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
217 ## &vmovdqu ($roundkey,"0x00-0x80($inp)");'
218 ## &vmovdqu ($inout,($inp));
219 ## &mov ($_inp,$inp);
221 '&vpxor ($inout,$inout,$roundkey);'.
222 ' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
224 '&vpxor ($inout,$inout,$iv);',
226 '&vaesenc ($inout,$inout,$roundkey);'.
227 ' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
229 '&vaesenc ($inout,$inout,$roundkey);'.
230 ' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
232 '&vaesenc ($inout,$inout,$roundkey);'.
233 ' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
235 '&vaesenc ($inout,$inout,$roundkey);'.
236 ' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
238 '&vaesenc ($inout,$inout,$roundkey);'.
239 ' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
241 '&vaesenc ($inout,$inout,$roundkey);'.
242 ' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
244 '&vaesenc ($inout,$inout,$roundkey);'.
245 ' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
247 '&vaesenc ($inout,$inout,$roundkey);'.
248 ' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
250 '&vaesenc ($inout,$inout,$roundkey);'.
251 ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
253 '&vaesenclast ($temp,$inout,$roundkey);'.
254 ' &vaesenc ($inout,$inout,$roundkey);'.
255 ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
257 '&vpand ($iv,$temp,$mask10);'.
258 ' &vaesenc ($inout,$inout,$roundkey);'.
259 ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
261 '&vaesenclast ($temp,$inout,$roundkey);'.
262 ' &vaesenc ($inout,$inout,$roundkey);'.
263 ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
265 '&vpand ($temp,$temp,$mask12);'.
266 ' &vaesenc ($inout,$inout,$roundkey);'.
267 '&vmovdqu ($roundkey,"0xe0-0x80($inp)");',
269 '&vpor ($iv,$iv,$temp);'.
270 ' &vaesenclast ($temp,$inout,$roundkey);'.
271 ' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
273 ## &mov ($inp,$_inp);
274 ## &mov ($out,$_out);
275 ## &vpand ($temp,$temp,$mask14);
276 ## &vpor ($iv,$iv,$temp);
277 ## &vmovdqu ($iv,"($out,$inp)");
278 ## &lea ($inp,"16($inp)");
282 my ($a,$b,$c,$d,$e,$f,$g,$h);
284 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
285 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
287 $arg = "\$$arg" if ($arg*1 eq $arg);
288 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
293 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
295 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
300 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
301 '&xor ($a4,$g)', # f^g
303 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
305 '&and ($a4,$e)', # (f^g)&e
307 @aesni_cbc_block[$aesni_cbc_idx++].
309 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
312 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
313 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
314 '&xor ($a2,$b)', # a^b, b^c in next round
316 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
317 '&add ($h,$a4)', # h+=Ch(e,f,g)
318 '&and ($a3,$a2)', # (b^c)&(a^b)
321 '&add ($h,$a0)', # h+=Sigma1(e)
322 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
324 '&add ($d,$h)', # d+=h
325 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
326 '&add ($h,$a3)', # h+=Maj(a,b,c)
329 '&add ($a1,$h);'. # h+=Sigma0(a)
330 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
335 ######################################################################
339 .type ${func}_xop,\@function,6
344 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
345 mov %rsp,%rax # copy %rsp
346 .cfi_def_cfa_register %rax
359 sub \$`$framesz+$win64*16*10`,%rsp
360 and \$-64,%rsp # align stack frame
363 sub $inp,$out # re-bias
365 add $inp,$len # end of input
367 #mov $inp,$_inp # saved later
370 #mov $key,$_key # remains resident in $inp register
375 .cfi_cfa_expression $_rsp,deref,+8
377 $code.=<<___ if ($win64);
378 movaps %xmm6,`$framesz+16*0`(%rsp)
379 movaps %xmm7,`$framesz+16*1`(%rsp)
380 movaps %xmm8,`$framesz+16*2`(%rsp)
381 movaps %xmm9,`$framesz+16*3`(%rsp)
382 movaps %xmm10,`$framesz+16*4`(%rsp)
383 movaps %xmm11,`$framesz+16*5`(%rsp)
384 movaps %xmm12,`$framesz+16*6`(%rsp)
385 movaps %xmm13,`$framesz+16*7`(%rsp)
386 movaps %xmm14,`$framesz+16*8`(%rsp)
387 movaps %xmm15,`$framesz+16*9`(%rsp)
393 mov $inp,%r12 # borrow $a4
394 lea 0x80($key),$inp # size optimization, reassign
395 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
396 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
397 mov $ctx,%r15 # borrow $a2
398 mov $in0,%rsi # borrow $a3
399 vmovdqu ($ivp),$iv # load IV
411 vmovdqa 0x00(%r13,%r14,8),$mask14
412 vmovdqa 0x10(%r13,%r14,8),$mask12
413 vmovdqa 0x20(%r13,%r14,8),$mask10
414 vmovdqu 0x00-0x80($inp),$roundkey
417 if ($SZ==4) { # SHA256
418 my @X = map("%xmm$_",(0..3));
419 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
424 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
425 vmovdqu 0x00(%rsi,%r12),@X[0]
426 vmovdqu 0x10(%rsi,%r12),@X[1]
427 vmovdqu 0x20(%rsi,%r12),@X[2]
428 vmovdqu 0x30(%rsi,%r12),@X[3]
429 vpshufb $t3,@X[0],@X[0]
430 lea $TABLE(%rip),$Tbl
431 vpshufb $t3,@X[1],@X[1]
432 vpshufb $t3,@X[2],@X[2]
433 vpaddd 0x00($Tbl),@X[0],$t0
434 vpshufb $t3,@X[3],@X[3]
435 vpaddd 0x20($Tbl),@X[1],$t1
436 vpaddd 0x40($Tbl),@X[2],$t2
437 vpaddd 0x60($Tbl),@X[3],$t3
438 vmovdqa $t0,0x00(%rsp)
440 vmovdqa $t1,0x10(%rsp)
442 vmovdqa $t2,0x20(%rsp)
444 vmovdqa $t3,0x30(%rsp)
450 sub \$-16*2*$SZ,$Tbl # size optimization
451 vmovdqu (%r12),$inout # $a4
454 sub XOP_256_00_47 () {
458 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
460 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
463 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
466 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
469 &vpsrld ($t0,$t0,$sigma0[2]);
472 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
477 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
480 &vpxor ($t0,$t0,$t1);
485 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
488 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
491 &vpsrld ($t2,@X[3],$sigma1[2]);
494 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
497 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
500 &vpxor ($t3,$t3,$t2);
505 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
510 &vpsrldq ($t3,$t3,8);
515 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
520 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
523 &vpsrld ($t2,@X[0],$sigma1[2]);
526 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
529 &vpxor ($t3,$t3,$t2);
534 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
539 &vpslldq ($t3,$t3,8); # 22 instructions
544 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
549 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
550 foreach (@insns) { eval; } # remaining instructions
551 &vmovdqa (16*$j."(%rsp)",$t2);
555 for ($i=0,$j=0; $j<4; $j++) {
556 &XOP_256_00_47($j,\&body_00_15,@X);
557 push(@X,shift(@X)); # rotate(@X)
559 &mov ("%r12",$_inp); # borrow $a4
560 &vpand ($temp,$temp,$mask14);
561 &mov ("%r15",$_out); # borrow $a2
562 &vpor ($iv,$iv,$temp);
563 &vmovdqu ("(%r15,%r12)",$iv); # write output
564 &lea ("%r12","16(%r12)"); # inp++
566 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
567 &jne (".Lxop_00_47");
569 &vmovdqu ($inout,"(%r12)");
573 for ($i=0; $i<16; ) {
574 foreach(body_00_15()) { eval; }
578 mov $_inp,%r12 # borrow $a4
579 mov $_out,%r13 # borrow $a0
580 mov $_ctx,%r15 # borrow $a2
581 mov $_in0,%rsi # borrow $a3
583 vpand $mask14,$temp,$temp
586 vmovdqu $iv,(%r13,%r12) # write output
587 lea 16(%r12),%r12 # inp++
614 vmovdqu $iv,($ivp) # output IV
617 $code.=<<___ if ($win64);
618 movaps `$framesz+16*0`(%rsp),%xmm6
619 movaps `$framesz+16*1`(%rsp),%xmm7
620 movaps `$framesz+16*2`(%rsp),%xmm8
621 movaps `$framesz+16*3`(%rsp),%xmm9
622 movaps `$framesz+16*4`(%rsp),%xmm10
623 movaps `$framesz+16*5`(%rsp),%xmm11
624 movaps `$framesz+16*6`(%rsp),%xmm12
625 movaps `$framesz+16*7`(%rsp),%xmm13
626 movaps `$framesz+16*8`(%rsp),%xmm14
627 movaps `$framesz+16*9`(%rsp),%xmm15
643 .cfi_def_cfa_register %rsp
647 .size ${func}_xop,.-${func}_xop
649 ######################################################################
652 local *ror = sub { &shrd(@_[0],@_) };
655 .type ${func}_avx,\@function,6
660 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
661 mov %rsp,%rax # copy %rsp
662 .cfi_def_cfa_register %rax
675 sub \$`$framesz+$win64*16*10`,%rsp
676 and \$-64,%rsp # align stack frame
679 sub $inp,$out # re-bias
681 add $inp,$len # end of input
683 #mov $inp,$_inp # saved later
686 #mov $key,$_key # remains resident in $inp register
691 .cfi_cfa_expression $_rsp,deref,+8
693 $code.=<<___ if ($win64);
694 movaps %xmm6,`$framesz+16*0`(%rsp)
695 movaps %xmm7,`$framesz+16*1`(%rsp)
696 movaps %xmm8,`$framesz+16*2`(%rsp)
697 movaps %xmm9,`$framesz+16*3`(%rsp)
698 movaps %xmm10,`$framesz+16*4`(%rsp)
699 movaps %xmm11,`$framesz+16*5`(%rsp)
700 movaps %xmm12,`$framesz+16*6`(%rsp)
701 movaps %xmm13,`$framesz+16*7`(%rsp)
702 movaps %xmm14,`$framesz+16*8`(%rsp)
703 movaps %xmm15,`$framesz+16*9`(%rsp)
709 mov $inp,%r12 # borrow $a4
710 lea 0x80($key),$inp # size optimization, reassign
711 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
712 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
713 mov $ctx,%r15 # borrow $a2
714 mov $in0,%rsi # borrow $a3
715 vmovdqu ($ivp),$iv # load IV
727 vmovdqa 0x00(%r13,%r14,8),$mask14
728 vmovdqa 0x10(%r13,%r14,8),$mask12
729 vmovdqa 0x20(%r13,%r14,8),$mask10
730 vmovdqu 0x00-0x80($inp),$roundkey
732 if ($SZ==4) { # SHA256
733 my @X = map("%xmm$_",(0..3));
734 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
740 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
741 vmovdqu 0x00(%rsi,%r12),@X[0]
742 vmovdqu 0x10(%rsi,%r12),@X[1]
743 vmovdqu 0x20(%rsi,%r12),@X[2]
744 vmovdqu 0x30(%rsi,%r12),@X[3]
745 vpshufb $t3,@X[0],@X[0]
746 lea $TABLE(%rip),$Tbl
747 vpshufb $t3,@X[1],@X[1]
748 vpshufb $t3,@X[2],@X[2]
749 vpaddd 0x00($Tbl),@X[0],$t0
750 vpshufb $t3,@X[3],@X[3]
751 vpaddd 0x20($Tbl),@X[1],$t1
752 vpaddd 0x40($Tbl),@X[2],$t2
753 vpaddd 0x60($Tbl),@X[3],$t3
754 vmovdqa $t0,0x00(%rsp)
756 vmovdqa $t1,0x10(%rsp)
758 vmovdqa $t2,0x20(%rsp)
760 vmovdqa $t3,0x30(%rsp)
766 sub \$-16*2*$SZ,$Tbl # size optimization
767 vmovdqu (%r12),$inout # $a4
770 sub Xupdate_256_AVX () {
772 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
773 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
774 '&vpsrld ($t2,$t0,$sigma0[0]);',
775 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
776 '&vpsrld ($t3,$t0,$sigma0[2])',
777 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
778 '&vpxor ($t0,$t3,$t2)',
779 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
780 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
781 '&vpxor ($t0,$t0,$t1)',
782 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
783 '&vpxor ($t0,$t0,$t2)',
784 '&vpsrld ($t2,$t3,$sigma1[2]);',
785 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
786 '&vpsrlq ($t3,$t3,$sigma1[0]);',
787 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
788 '&vpxor ($t2,$t2,$t3);',
789 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
790 '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
791 '&vpshufd ($t2,$t2,0b10000100)',
792 '&vpsrldq ($t2,$t2,8)',
793 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
794 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
795 '&vpsrld ($t2,$t3,$sigma1[2])',
796 '&vpsrlq ($t3,$t3,$sigma1[0])',
797 '&vpxor ($t2,$t2,$t3);',
798 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
799 '&vpxor ($t2,$t2,$t3)',
800 '&vpshufd ($t2,$t2,0b11101000)',
801 '&vpslldq ($t2,$t2,8)',
802 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
806 sub AVX_256_00_47 () {
810 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
812 foreach (Xupdate_256_AVX()) { # 29 instructions
818 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
819 foreach (@insns) { eval; } # remaining instructions
820 &vmovdqa (16*$j."(%rsp)",$t2);
824 for ($i=0,$j=0; $j<4; $j++) {
825 &AVX_256_00_47($j,\&body_00_15,@X);
826 push(@X,shift(@X)); # rotate(@X)
828 &mov ("%r12",$_inp); # borrow $a4
829 &vpand ($temp,$temp,$mask14);
830 &mov ("%r15",$_out); # borrow $a2
831 &vpor ($iv,$iv,$temp);
832 &vmovdqu ("(%r15,%r12)",$iv); # write output
833 &lea ("%r12","16(%r12)"); # inp++
835 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
836 &jne (".Lavx_00_47");
838 &vmovdqu ($inout,"(%r12)");
842 for ($i=0; $i<16; ) {
843 foreach(body_00_15()) { eval; }
848 mov $_inp,%r12 # borrow $a4
849 mov $_out,%r13 # borrow $a0
850 mov $_ctx,%r15 # borrow $a2
851 mov $_in0,%rsi # borrow $a3
853 vpand $mask14,$temp,$temp
856 vmovdqu $iv,(%r13,%r12) # write output
857 lea 16(%r12),%r12 # inp++
883 vmovdqu $iv,($ivp) # output IV
886 $code.=<<___ if ($win64);
887 movaps `$framesz+16*0`(%rsp),%xmm6
888 movaps `$framesz+16*1`(%rsp),%xmm7
889 movaps `$framesz+16*2`(%rsp),%xmm8
890 movaps `$framesz+16*3`(%rsp),%xmm9
891 movaps `$framesz+16*4`(%rsp),%xmm10
892 movaps `$framesz+16*5`(%rsp),%xmm11
893 movaps `$framesz+16*6`(%rsp),%xmm12
894 movaps `$framesz+16*7`(%rsp),%xmm13
895 movaps `$framesz+16*8`(%rsp),%xmm14
896 movaps `$framesz+16*9`(%rsp),%xmm15
912 .cfi_def_cfa_register %rsp
916 .size ${func}_avx,.-${func}_avx
920 ######################################################################
923 my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
928 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
930 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
932 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
933 '&and ($a4,$e)', # f&e
934 '&rorx ($a0,$e,$Sigma1[2])',
935 '&rorx ($a2,$e,$Sigma1[1])',
937 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
938 '&lea ($h,"($h,$a4)")',
939 '&andn ($a4,$e,$g)', # ~e&g
942 '&rorx ($a1,$e,$Sigma1[0])',
943 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
944 '&xor ($a0,$a1)', # Sigma1(e)
947 '&rorx ($a4,$a,$Sigma0[2])',
948 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
949 '&xor ($a2,$b)', # a^b, b^c in next round
950 '&rorx ($a1,$a,$Sigma0[1])',
952 '&rorx ($a0,$a,$Sigma0[0])',
953 '&lea ($d,"($d,$h)")', # d+=h
954 '&and ($a3,$a2)', # (b^c)&(a^b)
955 @aesni_cbc_block[$aesni_cbc_idx++].
958 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
959 '&xor ($a1,$a0)', # Sigma0(a)
960 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
961 '&mov ($a4,$e)', # copy of f in future
963 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
965 # and at the finish one has to $a+=$a1
969 .type ${func}_avx2,\@function,6
974 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
975 mov %rsp,%rax # copy %rsp
976 .cfi_def_cfa_register %rax
989 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
990 and \$-256*$SZ,%rsp # align stack frame
991 add \$`2*$SZ*($rounds-8)`,%rsp
994 sub $inp,$out # re-bias
996 add $inp,$len # end of input
998 #mov $inp,$_inp # saved later
999 #mov $out,$_out # kept in $offload
1001 #mov $key,$_key # remains resident in $inp register
1006 .cfi_cfa_expression $_rsp,deref,+8
1008 $code.=<<___ if ($win64);
1009 movaps %xmm6,`$framesz+16*0`(%rsp)
1010 movaps %xmm7,`$framesz+16*1`(%rsp)
1011 movaps %xmm8,`$framesz+16*2`(%rsp)
1012 movaps %xmm9,`$framesz+16*3`(%rsp)
1013 movaps %xmm10,`$framesz+16*4`(%rsp)
1014 movaps %xmm11,`$framesz+16*5`(%rsp)
1015 movaps %xmm12,`$framesz+16*6`(%rsp)
1016 movaps %xmm13,`$framesz+16*7`(%rsp)
1017 movaps %xmm14,`$framesz+16*8`(%rsp)
1018 movaps %xmm15,`$framesz+16*9`(%rsp)
1024 mov $inp,%r13 # borrow $a0
1025 vpinsrq \$1,$out,$offload,$offload
1026 lea 0x80($key),$inp # size optimization, reassign
1027 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
1028 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
1029 mov $ctx,%r15 # borrow $a2
1030 mov $in0,%rsi # borrow $a3
1031 vmovdqu ($ivp),$iv # load IV
1034 vmovdqa 0x00(%r12,%r14,8),$mask14
1035 vmovdqa 0x10(%r12,%r14,8),$mask12
1036 vmovdqa 0x20(%r12,%r14,8),$mask10
1038 sub \$-16*$SZ,%r13 # inp++, size optimization
1040 lea (%rsi,%r13),%r12 # borrow $a0
1042 cmp $len,%r13 # $_end
1044 cmove %rsp,%r12 # next block or random data
1050 vmovdqu 0x00-0x80($inp),$roundkey
1052 if ($SZ==4) { # SHA256
1053 my @X = map("%ymm$_",(0..3));
1054 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1060 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1061 vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1062 vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1063 vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1064 vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1066 vinserti128 \$1,(%r12),@X[0],@X[0]
1067 vinserti128 \$1,16(%r12),@X[1],@X[1]
1068 vpshufb $t3,@X[0],@X[0]
1069 vinserti128 \$1,32(%r12),@X[2],@X[2]
1070 vpshufb $t3,@X[1],@X[1]
1071 vinserti128 \$1,48(%r12),@X[3],@X[3]
1073 lea $TABLE(%rip),$Tbl
1074 vpshufb $t3,@X[2],@X[2]
1075 lea -16*$SZ(%r13),%r13
1076 vpaddd 0x00($Tbl),@X[0],$t0
1077 vpshufb $t3,@X[3],@X[3]
1078 vpaddd 0x20($Tbl),@X[1],$t1
1079 vpaddd 0x40($Tbl),@X[2],$t2
1080 vpaddd 0x60($Tbl),@X[3],$t3
1081 vmovdqa $t0,0x00(%rsp)
1083 vmovdqa $t1,0x20(%rsp)
1084 lea -$PUSH8(%rsp),%rsp
1086 vmovdqa $t2,0x00(%rsp)
1088 vmovdqa $t3,0x20(%rsp)
1090 sub \$-16*2*$SZ,$Tbl # size optimization
1095 vmovdqu (%r13),$inout
1096 vpinsrq \$0,%r13,$offload,$offload
1099 sub AVX2_256_00_47 () {
1103 my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
1104 my $base = "+2*$PUSH8(%rsp)";
1106 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
1107 foreach (Xupdate_256_AVX()) { # 29 instructions
1109 eval(shift(@insns));
1110 eval(shift(@insns));
1111 eval(shift(@insns));
1113 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1114 foreach (@insns) { eval; } # remaining instructions
1115 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
1118 for ($i=0,$j=0; $j<4; $j++) {
1119 &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1120 push(@X,shift(@X)); # rotate(@X)
1122 &vmovq ("%r13",$offload); # borrow $a0
1123 &vpextrq ("%r15",$offload,1); # borrow $a2
1124 &vpand ($temp,$temp,$mask14);
1125 &vpor ($iv,$iv,$temp);
1126 &vmovdqu ("(%r15,%r13)",$iv); # write output
1127 &lea ("%r13","16(%r13)"); # inp++
1129 &lea ($Tbl,16*2*$SZ."($Tbl)");
1130 &cmpb (($SZ-1)."($Tbl)",0);
1131 &jne (".Lavx2_00_47");
1133 &vmovdqu ($inout,"(%r13)");
1134 &vpinsrq ($offload,$offload,"%r13",0);
1137 for ($i=0; $i<16; ) {
1138 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1139 foreach(bodyx_00_15()) { eval; }
1143 vpextrq \$1,$offload,%r12 # $_out, borrow $a4
1144 vmovq $offload,%r13 # $_inp, borrow $a0
1145 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1147 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
1149 vpand $mask14,$temp,$temp
1151 vmovdqu $iv,(%r12,%r13) # write output
1172 cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
1182 vmovdqu (%r13),$inout
1183 vpinsrq \$0,%r13,$offload,$offload
1186 for ($i=0; $i<16; ) {
1187 my $base="+16($Tbl)";
1188 foreach(bodyx_00_15()) { eval; }
1189 &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8);
1192 vmovq $offload,%r13 # borrow $a0
1193 vpextrq \$1,$offload,%r15 # borrow $a2
1194 vpand $mask14,$temp,$temp
1196 lea -$PUSH8($Tbl),$Tbl
1197 vmovdqu $iv,(%r15,%r13) # write output
1198 lea 16(%r13),%r13 # inp++
1202 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1203 lea 16*$SZ(%r13),%r13
1204 mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
1206 lea `2*$SZ*($rounds-8)`(%rsp),%rsp
1215 lea (%rsi,%r13),%r12
1221 cmove %rsp,%r12 # next block or stale data
1238 vmovdqu $iv,($ivp) # output IV
1241 $code.=<<___ if ($win64);
1242 movaps `$framesz+16*0`(%rsp),%xmm6
1243 movaps `$framesz+16*1`(%rsp),%xmm7
1244 movaps `$framesz+16*2`(%rsp),%xmm8
1245 movaps `$framesz+16*3`(%rsp),%xmm9
1246 movaps `$framesz+16*4`(%rsp),%xmm10
1247 movaps `$framesz+16*5`(%rsp),%xmm11
1248 movaps `$framesz+16*6`(%rsp),%xmm12
1249 movaps `$framesz+16*7`(%rsp),%xmm13
1250 movaps `$framesz+16*8`(%rsp),%xmm14
1251 movaps `$framesz+16*9`(%rsp),%xmm15
1267 .cfi_def_cfa_register %rsp
1271 .size ${func}_avx2,.-${func}_avx2
1276 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1278 my ($rounds,$Tbl)=("%r11d","%rbx");
1280 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1281 my @rndkey=("%xmm4","%xmm5");
1285 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1286 my @MSG=map("%xmm$_",(10..13));
1290 my ($n,$k)=($r/10,$r%10);
1293 movups `16*$n`($in0),$in # load input
1296 $code.=<<___ if ($n);
1297 movups $iv,`16*($n-1)`($out,$in0) # write output
1301 movups `32+16*$k-112`($key),$rndkey[1]
1302 aesenc $rndkey[0],$iv
1309 movups `32+16*($k+0)-112`($key),$rndkey[1]
1310 aesenc $rndkey[0],$iv
1311 movups `32+16*($k+1)-112`($key),$rndkey[0]
1312 aesenc $rndkey[1],$iv
1314 movups `32+16*($k+2)-112`($key),$rndkey[1]
1315 aesenc $rndkey[0],$iv
1316 movups `32+16*($k+3)-112`($key),$rndkey[0]
1317 aesenc $rndkey[1],$iv
1319 aesenclast $rndkey[0],$iv
1320 movups 16-112($key),$rndkey[1] # forward reference
1325 movups `32+16*$k-112`($key),$rndkey[1]
1326 aesenc $rndkey[0],$iv
1329 $r++; unshift(@rndkey,pop(@rndkey));
1336 .type ${func}_shaext,\@function,6
1339 mov `($win64?56:8)`(%rsp),$inp # load 7th argument
1341 $code.=<<___ if ($win64);
1342 lea `-8-10*16`(%rsp),%rsp
1343 movaps %xmm6,-8-10*16(%rax)
1344 movaps %xmm7,-8-9*16(%rax)
1345 movaps %xmm8,-8-8*16(%rax)
1346 movaps %xmm9,-8-7*16(%rax)
1347 movaps %xmm10,-8-6*16(%rax)
1348 movaps %xmm11,-8-5*16(%rax)
1349 movaps %xmm12,-8-4*16(%rax)
1350 movaps %xmm13,-8-3*16(%rax)
1351 movaps %xmm14,-8-2*16(%rax)
1352 movaps %xmm15,-8-1*16(%rax)
1356 lea K256+0x80(%rip),$Tbl
1357 movdqu ($ctx),$ABEF # DCBA
1358 movdqu 16($ctx),$CDGH # HGFE
1359 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
1361 mov 240($key),$rounds
1363 movups ($key),$rndkey0 # $key[0]
1364 movups 16($key),$rndkey[0] # forward reference
1365 lea 112($key),$key # size optimization
1367 pshufd \$0x1b,$ABEF,$Wi # ABCD
1368 pshufd \$0xb1,$ABEF,$ABEF # CDAB
1369 pshufd \$0x1b,$CDGH,$CDGH # EFGH
1370 movdqa $TMP,$BSWAP # offload
1371 palignr \$8,$CDGH,$ABEF # ABEF
1372 punpcklqdq $Wi,$CDGH # CDGH
1378 movdqu ($inp),@MSG[0]
1379 movdqu 0x10($inp),@MSG[1]
1380 movdqu 0x20($inp),@MSG[2]
1382 movdqu 0x30($inp),@MSG[3]
1384 movdqa 0*32-0x80($Tbl),$Wi
1387 movdqa $CDGH,$CDGH_SAVE # offload
1388 movdqa $ABEF,$ABEF_SAVE # offload
1392 sha256rnds2 $ABEF,$CDGH # 0-3
1393 pshufd \$0x0e,$Wi,$Wi
1397 sha256rnds2 $CDGH,$ABEF
1399 movdqa 1*32-0x80($Tbl),$Wi
1406 sha256rnds2 $ABEF,$CDGH # 4-7
1407 pshufd \$0x0e,$Wi,$Wi
1411 sha256rnds2 $CDGH,$ABEF
1413 movdqa 2*32-0x80($Tbl),$Wi
1416 sha256msg1 @MSG[1],@MSG[0]
1420 sha256rnds2 $ABEF,$CDGH # 8-11
1421 pshufd \$0x0e,$Wi,$Wi
1423 palignr \$4,@MSG[2],$TMP
1428 sha256rnds2 $CDGH,$ABEF
1430 movdqa 3*32-0x80($Tbl),$Wi
1432 sha256msg2 @MSG[3],@MSG[0]
1433 sha256msg1 @MSG[2],@MSG[1]
1437 sha256rnds2 $ABEF,$CDGH # 12-15
1438 pshufd \$0x0e,$Wi,$Wi
1443 palignr \$4,@MSG[3],$TMP
1445 sha256rnds2 $CDGH,$ABEF
1447 for($i=4;$i<16-3;$i++) {
1448 &$aesenc() if (($r%10)==0);
1450 movdqa $i*32-0x80($Tbl),$Wi
1452 sha256msg2 @MSG[0],@MSG[1]
1453 sha256msg1 @MSG[3],@MSG[2]
1457 sha256rnds2 $ABEF,$CDGH # 16-19...
1458 pshufd \$0x0e,$Wi,$Wi
1460 palignr \$4,@MSG[0],$TMP
1464 &$aesenc() if ($r==19);
1466 sha256rnds2 $CDGH,$ABEF
1468 push(@MSG,shift(@MSG));
1471 movdqa 13*32-0x80($Tbl),$Wi
1473 sha256msg2 @MSG[0],@MSG[1]
1474 sha256msg1 @MSG[3],@MSG[2]
1478 sha256rnds2 $ABEF,$CDGH # 52-55
1479 pshufd \$0x0e,$Wi,$Wi
1481 palignr \$4,@MSG[0],$TMP
1487 sha256rnds2 $CDGH,$ABEF
1489 movdqa 14*32-0x80($Tbl),$Wi
1491 sha256msg2 @MSG[1],@MSG[2]
1496 sha256rnds2 $ABEF,$CDGH # 56-59
1497 pshufd \$0x0e,$Wi,$Wi
1501 sha256rnds2 $CDGH,$ABEF
1503 movdqa 15*32-0x80($Tbl),$Wi
1509 sha256rnds2 $ABEF,$CDGH # 60-63
1510 pshufd \$0x0e,$Wi,$Wi
1514 sha256rnds2 $CDGH,$ABEF
1515 #pxor $CDGH,$rndkey0 # black magic
1517 while ($r<40) { &$aesenc(); } # remaining aesenc's
1519 #xorps $CDGH,$rndkey0 # black magic
1520 paddd $CDGH_SAVE,$CDGH
1521 paddd $ABEF_SAVE,$ABEF
1524 movups $iv,48($out,$in0) # write output
1528 pshufd \$0xb1,$CDGH,$CDGH # DCHG
1529 pshufd \$0x1b,$ABEF,$TMP # FEBA
1530 pshufd \$0xb1,$ABEF,$ABEF # BAFE
1531 punpckhqdq $CDGH,$ABEF # DCBA
1532 palignr \$8,$TMP,$CDGH # HGFE
1534 movups $iv,($ivp) # write IV
1536 movdqu $CDGH,16($ctx)
1538 $code.=<<___ if ($win64);
1539 movaps 0*16(%rsp),%xmm6
1540 movaps 1*16(%rsp),%xmm7
1541 movaps 2*16(%rsp),%xmm8
1542 movaps 3*16(%rsp),%xmm9
1543 movaps 4*16(%rsp),%xmm10
1544 movaps 5*16(%rsp),%xmm11
1545 movaps 6*16(%rsp),%xmm12
1546 movaps 7*16(%rsp),%xmm13
1547 movaps 8*16(%rsp),%xmm14
1548 movaps 9*16(%rsp),%xmm15
1549 lea 8+10*16(%rsp),%rsp
1554 .size ${func}_shaext,.-${func}_shaext
1559 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1560 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1561 if ($win64 && $avx) {
1568 .extern __imp_RtlVirtualUnwind
1569 .type se_handler,\@abi-omnipotent
1583 mov 120($context),%rax # pull context->Rax
1584 mov 248($context),%rbx # pull context->Rip
1586 mov 8($disp),%rsi # disp->ImageBase
1587 mov 56($disp),%r11 # disp->HanderlData
1589 mov 0(%r11),%r10d # HandlerData[0]
1590 lea (%rsi,%r10),%r10 # prologue label
1591 cmp %r10,%rbx # context->Rip<prologue label
1594 mov 152($context),%rax # pull context->Rsp
1596 mov 4(%r11),%r10d # HandlerData[1]
1597 lea (%rsi,%r10),%r10 # epilogue label
1598 cmp %r10,%rbx # context->Rip>=epilogue label
1601 $code.=<<___ if ($shaext);
1602 lea aesni_cbc_sha256_enc_shaext(%rip),%r10
1607 lea 512($context),%rdi # &context.Xmm6
1609 .long 0xa548f3fc # cld; rep movsq
1610 lea 168(%rax),%rax # adjust stack pointer
1614 $code.=<<___ if ($avx>1);
1615 lea .Lavx2_shortcut(%rip),%r10
1616 cmp %r10,%rbx # context->Rip<avx2_shortcut
1620 add \$`2*$SZ*($rounds-8)`,%rax
1624 mov %rax,%rsi # put aside Rsp
1625 mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
1633 mov %rbx,144($context) # restore context->Rbx
1634 mov %rbp,160($context) # restore context->Rbp
1635 mov %r12,216($context) # restore context->R12
1636 mov %r13,224($context) # restore context->R13
1637 mov %r14,232($context) # restore context->R14
1638 mov %r15,240($context) # restore context->R15
1640 lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area
1641 lea 512($context),%rdi # &context.Xmm6
1643 .long 0xa548f3fc # cld; rep movsq
1648 mov %rax,152($context) # restore context->Rsp
1649 mov %rsi,168($context) # restore context->Rsi
1650 mov %rdi,176($context) # restore context->Rdi
1652 mov 40($disp),%rdi # disp->ContextRecord
1653 mov $context,%rsi # context
1654 mov \$154,%ecx # sizeof(CONTEXT)
1655 .long 0xa548f3fc # cld; rep movsq
1658 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1659 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1660 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1661 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1662 mov 40(%rsi),%r10 # disp->ContextRecord
1663 lea 56(%rsi),%r11 # &disp->HandlerData
1664 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1665 mov %r10,32(%rsp) # arg5
1666 mov %r11,40(%rsp) # arg6
1667 mov %r12,48(%rsp) # arg7
1668 mov %rcx,56(%rsp) # arg8, (NULL)
1669 call *__imp_RtlVirtualUnwind(%rip)
1671 mov \$1,%eax # ExceptionContinueSearch
1683 .size se_handler,.-se_handler
1686 .rva .LSEH_begin_${func}_xop
1687 .rva .LSEH_end_${func}_xop
1688 .rva .LSEH_info_${func}_xop
1690 .rva .LSEH_begin_${func}_avx
1691 .rva .LSEH_end_${func}_avx
1692 .rva .LSEH_info_${func}_avx
1694 $code.=<<___ if ($avx>1);
1695 .rva .LSEH_begin_${func}_avx2
1696 .rva .LSEH_end_${func}_avx2
1697 .rva .LSEH_info_${func}_avx2
1699 $code.=<<___ if ($shaext);
1700 .rva .LSEH_begin_${func}_shaext
1701 .rva .LSEH_end_${func}_shaext
1702 .rva .LSEH_info_${func}_shaext
1707 .LSEH_info_${func}_xop:
1710 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
1712 .LSEH_info_${func}_avx:
1715 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
1717 $code.=<<___ if ($avx>1);
1718 .LSEH_info_${func}_avx2:
1721 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
1723 $code.=<<___ if ($shaext);
1724 .LSEH_info_${func}_shaext:
1727 .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
1731 ####################################################################
1733 local *opcode=shift;
1737 $rex|=0x04 if($dst>=8);
1738 $rex|=0x01 if($src>=8);
1739 unshift @opcode,$rex|0x40 if($rex);
1744 "sha256rnds2" => 0xcb,
1745 "sha256msg1" => 0xcc,
1746 "sha256msg2" => 0xcd );
1751 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1752 my @opcode=(0x0f,0x38);
1753 rex(\@opcode,$2,$1);
1754 push @opcode,$opcodelet{$instr};
1755 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
1756 return ".byte\t".join(',',@opcode);
1758 return $instr."\t".@_[0];
1763 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1764 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;