# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encrypt exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm utilizes processor
# resources better and achieves better overall performance. The SHA256
# instruction sequences(*) are taken from sha512-x86_64.pl and the AESNI
# code is woven into them. As SHA256 dominates execution time, stitch
# performance does not depend on AES key length. Below are performance
# numbers in cycles per processed byte, less is better, for standalone
# AESNI-CBC encrypt, standalone SHA256, and the stitched subroutine:
#			AES-128/-192/-256+SHA256	this(**)	gain
# Sandy Bridge		5.05/6.05/7.05+11.6		13.0		+28%/36%/43%
# Ivy Bridge		5.05/6.05/7.05+10.3		11.6		+32%/41%/50%
# Haswell		4.43/5.29/6.19+7.80		8.79		+39%/49%/59%
# Skylake		2.62/3.14/3.62+7.70		8.10		+27%/34%/40%
# Bulldozer		5.77/6.89/8.00+13.7		13.7		+42%/50%/58%
# Ryzen(***)		2.71/-/3.71+2.05		2.74/-/3.73	+74%/-/54%
# Goldmont(***)		3.82/-/5.35+4.16		4.73/-/5.94	+69%/-/60%
# (*)	there are XOP, AVX1 and AVX2 code paths, i.e. no SSSE3 path,
#	meaning that Westmere is left out; the gain there was not
#	estimated high enough to justify the effort;
# (**)	these are EVP-free results, results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
# (***)	these are SHAEXT results;
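# As a quick cross-check of the table above, each quoted gain is simply
# the sum of the standalone results over the stitched result; e.g. an
# illustrative one-liner for the Sandy Bridge AES-128 row:
#
#	my ($aes,$sha,$stitch) = (5.05,11.6,13.0);
#	printf "+%.0f%%\n", 100*(($aes+$sha)/$stitch-1);	# prints +28%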
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}
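# In other words, the probes above leave $avx on a 0/1/2 scale: 0 - the
# assembler can't express AVX at all, 1 - AVX1 only, 2 - AVX2 is
# expressible too; all four probes (gas, NASM, MASM, clang) feed the
# same scale.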
$shaext=$avx;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
$func="aesni_cbc_sha256_enc";
$TABLE="K256";
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
$rounds=64;
########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *ivp,
#			const SHA256_CTX *ctx,
#			const void *in0);
($inp, $out, $len, $key, $ivp, $ctx, $in0) =
("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$Tbl="%rbp";
$framesz=16*$SZ+8*8;
$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="`16*$SZ+7*8`(%rsp)";
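# Stack frame, low to high: the first 16*$SZ bytes hold the 16 prepared
# message-schedule words X[i]+K[i] currently in flight, followed by
# eight quadwords carrying the spilled arguments and the original %rsp.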
$code=<<___;
.text

.extern	OPENSSL_ia32cap_P
.globl	$func
.type	$func,\@abi-omnipotent
.align	16
$func:
.cfi_startproc
	lea	OPENSSL_ia32cap_P(%rip),%r11
	mov	\$1,%eax
	cmp	\$0,`$win64?"%rcx":"%rdi"`
	je	.Lprobe
	mov	0(%r11),%eax
	mov	4(%r11),%r10
___
$code.=<<___ if ($shaext);
	bt	\$61,%r10			# check for SHA
	jc	${func}_shaext
___
$code.=<<___;
	mov	%r10,%r11
	shr	\$32,%r11

	test	\$`1<<11`,%r10d			# check for XOP
	jnz	${func}_xop
___
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	je	${func}_avx2
___
$code.=<<___;
	and	\$`1<<28`,%r10d			# check for AVX
	jnz	${func}_avx
	ud2
	xor	%eax,%eax
	cmp	\$0,`$win64?"%rcx":"%rdi"`
	je	.Lprobe
	ud2
.Lprobe:
	ret
.cfi_endproc
.size	$func,.-$func
___
$code.=<<___;
.align	64
.type	$TABLE,\@object
$TABLE:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
	.long	0,0,0,0,   0,0,0,0
	.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
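# Note the layout: every K256 row is stored twice, so that the AVX2 path
# can feed both 128-bit lanes with a single 256-bit load; the trailing
# rows are the byte-swap mask (also doubled) and the zero/all-ones rows
# that $mask10/$mask12/$mask14 are loaded from, indexed by the AES round
# count.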
######################################################################
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
$aesni_cbc_idx=0;
my @aesni_cbc_block = (
##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
##	&vmovdqu	($inout,($inp));
##	&mov		($_inp,$inp);
	'&vpxor		($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',
	'&vpxor		($inout,$inout,$iv);',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',
	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',
	'&vpand		($iv,$temp,$mask10);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',
	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',
	'&vpand		($temp,$temp,$mask12);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xe0-0x80($inp)");',
	'&vpor		($iv,$iv,$temp);'.
	' &vaesenclast	($temp,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'
	);
##	&mov		($inp,$_inp);
##	&mov		($out,$_out);
##	&vpand		($temp,$temp,$mask14);
##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	($iv,"($out,$inp)");
##	&lea		($inp,"16($inp)");
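# Each element above carries (at most) one AES round and is spliced into
# the SHA256 round function via $aesni_cbc_idx. Since AES-128/-192/-256
# complete after 10/12/14 rounds, vaesenclast is issued speculatively at
# rounds 10, 12 and 14, and the $mask10/$mask12/$mask14 blends keep
# exactly one of the three candidate results in $iv.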
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
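# e.g. &ror($a0,14) appends "\tror\t\$14,%r13d\n" to $code: the last
# argument becomes the immediate and the remaining operands are emitted
# in reverse, i.e. AT&T, order.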
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g

	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&and	($a4,$e)',			# (f^g)&e

	@aesni_cbc_block[$aesni_cbc_idx++].
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]

	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&xor	($a2,$b)',			# a^b, b^c in next round

	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&and	($a3,$a2)',			# (b^c)&(a^b)

	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)

	'&add	($d,$h)',			# d+=h
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($h,$a3)',			# h+=Maj(a,b,c)

	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
}
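# Round layout: $a0 and $a1 accumulate Sigma1(e) and Sigma0(a), $a4
# builds Ch(e,f,g), and $a2/$a3 alternate as Maj(a,b,c) scratch. The
# closing unshift rotates @ROT, so the next round is the same code with
# renamed registers; after 8 rounds the tuple is back where it started.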
######################################################################
# XOP code path
#
$code.=<<___;
.type	${func}_xop,\@function,6
.align	64
${func}_xop:
.cfi_startproc
.Lxop_shortcut:
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp		# align stack frame

	shl	\$6,$len
	sub	$inp,$out		# re-bias
	sub	$inp,$in0
	add	$inp,$len		# end of input

	#mov	$inp,$_inp		# saved later
	mov	$out,$_out
	mov	$len,$_end
	#mov	$key,$_key		# remains resident in $inp register
	mov	$ivp,$_ivp
	mov	$ctx,$_ctx
	mov	$in0,$_in0
	mov	%rax,$_rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_xop:
	vzeroall

	mov	$inp,%r12		# borrow $a4
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	sub	\$9,%r14

	mov	$SZ*0(%r15),$A
	mov	$SZ*1(%r15),$B
	mov	$SZ*2(%r15),$C
	mov	$SZ*3(%r15),$D
	mov	$SZ*4(%r15),$E
	mov	$SZ*5(%r15),$F
	mov	$SZ*6(%r15),$G
	mov	$SZ*7(%r15),$H

	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
___
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

$code.=<<___;
.Lloop_xop:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	mov	$A,$a1
	vmovdqa	$t1,0x10(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x20(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x30(%rsp)
	mov	$E,$a0
	jmp	.Lxop_00_47

.align	16
.Lxop_00_47:
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r12),$inout		# $a4
___
sub XOP_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	&vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	&vpsrld		($t0,$t0,$sigma0[2]);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
	&vpxor		($t0,$t0,$t1);
	&vprotd		($t3,@X[3],8*$SZ-$sigma1[1]);
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	&vpsrld		($t2,@X[3],$sigma1[2]);
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	&vpsrldq	($t3,$t3,8);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	&vprotd		($t3,@X[0],8*$SZ-$sigma1[1]);
	&vpsrld		($t2,@X[0],$sigma1[2]);
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	&vpslldq	($t3,$t3,8);		# 22 instructions
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}
for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
}
	&mov	("%r12",$_inp);			# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);			# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea	("%r12","16(%r12)");		# inp++

	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lxop_00_47");

	&vmovdqu	($inout,"(%r12)");
	&mov	($_inp,"%r12");
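# The .Lxop_00_47 loop above runs three times, covering rounds 0-47
# while preparing the message schedule 16 rounds ahead; $Tbl advances by
# 16*2*$SZ per pass, and the cmpb finds a zero byte once the next "row"
# would be the byte-swap mask that follows the (doubled) K256 entries.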
for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
}
}
$code.=<<___;
	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3

	vpand	$mask14,$temp,$temp
	vpor	$temp,$iv,$iv
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++

	add	$SZ*0(%r15),$A
	add	$SZ*1(%r15),$B
	add	$SZ*2(%r15),$C
	add	$SZ*3(%r15),$D
	add	$SZ*4(%r15),$E
	add	$SZ*5(%r15),$F
	add	$SZ*6(%r15),$G
	add	$SZ*7(%r15),$H

	cmp	$_end,%r12

	mov	$A,$SZ*0(%r15)
	mov	$B,$SZ*1(%r15)
	mov	$C,$SZ*2(%r15)
	mov	$D,$SZ*3(%r15)
	mov	$E,$SZ*4(%r15)
	mov	$F,$SZ*5(%r15)
	mov	$G,$SZ*6(%r15)
	mov	$H,$SZ*7(%r15)

	jb	.Lloop_xop

	mov	$_ivp,$ivp
	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	vmovdqu	$iv,($ivp)		# output IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_xop:
	ret
.cfi_endproc
.size	${func}_xop,.-${func}_xop
___
######################################################################
# AVX+shrd code path
#
local *ror = sub { &shrd(@_[0],@_) };
$code.=<<___;
.type	${func}_avx,\@function,6
.align	64
${func}_avx:
.cfi_startproc
.Lavx_shortcut:
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp		# align stack frame

	shl	\$6,$len
	sub	$inp,$out		# re-bias
	sub	$inp,$in0
	add	$inp,$len		# end of input

	#mov	$inp,$_inp		# saved later
	mov	$out,$_out
	mov	$len,$_end
	#mov	$key,$_key		# remains resident in $inp register
	mov	$ivp,$_ivp
	mov	$ctx,$_ctx
	mov	$in0,$_in0
	mov	%rax,$_rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_avx:
	vzeroall

	mov	$inp,%r12		# borrow $a4
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	sub	\$9,%r14

	mov	$SZ*0(%r15),$A
	mov	$SZ*1(%r15),$B
	mov	$SZ*2(%r15),$C
	mov	$SZ*3(%r15),$D
	mov	$SZ*4(%r15),$E
	mov	$SZ*5(%r15),$F
	mov	$SZ*6(%r15),$G
	mov	$SZ*7(%r15),$H

	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
___
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

$code.=<<___;
.Lloop_avx:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	mov	$A,$a1
	vmovdqa	$t1,0x10(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x20(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x30(%rsp)
	mov	$E,$a0
	jmp	.Lavx_00_47

.align	16
.Lavx_00_47:
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r12),$inout		# $a4
___
sub Xupdate_256_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',		# sigma1(X[14..15])
	'&vpshufd	($t2,$t2,0b10000100)',
	'&vpsrldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',
	'&vpshufd	($t2,$t2,0b11101000)',
	'&vpslldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
	);
}
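# For reference, this implements the SHA-256 message schedule functions
#	sigma0(x) = ROTR^7(x) ^ ROTR^18(x) ^ SHR^3(x)
#	sigma1(x) = ROTR^17(x) ^ ROTR^19(x) ^ SHR^10(x)
# with the rotates synthesized from paired shifts and xors, since AVX
# has no packed rotate; sigma1 is applied to only two words at a time
# because no more than X[14..15] (and later X[16..17]) are available
# that early in the update.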
sub AVX_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}
for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
}
	&mov	("%r12",$_inp);			# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);			# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea	("%r12","16(%r12)");		# inp++

	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");

	&vmovdqu	($inout,"(%r12)");
	&mov	($_inp,"%r12");
for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
}
}
$code.=<<___;
	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3

	vpand	$mask14,$temp,$temp
	vpor	$temp,$iv,$iv
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++

	add	$SZ*0(%r15),$A
	add	$SZ*1(%r15),$B
	add	$SZ*2(%r15),$C
	add	$SZ*3(%r15),$D
	add	$SZ*4(%r15),$E
	add	$SZ*5(%r15),$F
	add	$SZ*6(%r15),$G
	add	$SZ*7(%r15),$H

	cmp	$_end,%r12

	mov	$A,$SZ*0(%r15)
	mov	$B,$SZ*1(%r15)
	mov	$C,$SZ*2(%r15)
	mov	$D,$SZ*3(%r15)
	mov	$E,$SZ*4(%r15)
	mov	$F,$SZ*5(%r15)
	mov	$G,$SZ*6(%r15)
	mov	$H,$SZ*7(%r15)

	jb	.Lloop_avx

	mov	$_ivp,$ivp
	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	vmovdqu	$iv,($ivp)		# output IV
	vzeroall
___
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	${func}_avx,.-${func}_avx
___
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp
my $PUSH8=8*2*$SZ;
use integer;

sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 - $b^$c and $a4 - copy of $f
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',		# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',

	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',		# ~e&g
	'&xor	($a0,$a2)',
	'&rorx	($a1,$e,$Sigma1[0])',

	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',		# Sigma1(e)
	'&mov	($a2,$a)',

	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
	'&xor	($a2,$b)',		# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',

	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',	# d+=h
	'&and	($a3,$a2)',		# (b^c)&(a^b)
	@aesni_cbc_block[$aesni_cbc_idx++].

	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',		# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov	($a4,$e)',		# copy of f in future

	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
}
# ... and at the finish one has to add the leftover $a1 to $a
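# Unlike the scalar rounds above, bodyx_00_15 charges h+=Sigma0(a) to
# the *next* round ("from the past"), taking it off the critical path;
# hence the note that the leftover $a1 still has to be added to $a once
# the last round has been emitted.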
$code.=<<___;
.type	${func}_avx2,\@function,6
.align	64
${func}_avx2:
.cfi_startproc
.Lavx2_shortcut:
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
	and	\$-256*$SZ,%rsp		# align stack frame
	add	\$`2*$SZ*($rounds-8)`,%rsp

	shl	\$6,$len
	sub	$inp,$out		# re-bias
	sub	$inp,$in0
	add	$inp,$len		# end of input

	#mov	$inp,$_inp		# saved later
	#mov	$out,$_out		# kept in $offload
	mov	$len,$_end
	#mov	$key,$_key		# remains resident in $inp register
	mov	$ivp,$_ivp
	mov	$ctx,$_ctx
	mov	$in0,$_in0
	mov	%rax,$_rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_avx2:
	vzeroall

	mov	$inp,%r13		# borrow $a0
	vpinsrq	\$1,$out,$offload,$offload
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	sub	\$9,%r14

	vmovdqa	0x00(%r12,%r14,8),$mask14
	vmovdqa	0x10(%r12,%r14,8),$mask12
	vmovdqa	0x20(%r12,%r14,8),$mask10

	sub	\$-16*$SZ,%r13		# inp++, size optimization
	mov	$SZ*0(%r15),$A
	lea	(%rsi,%r13),%r12	# borrow $a0
	mov	$SZ*1(%r15),$B
	cmp	$len,%r13		# $_end
	mov	$SZ*2(%r15),$C
	cmove	%rsp,%r12		# next block or random data
	mov	$SZ*3(%r15),$D
	mov	$SZ*4(%r15),$E
	mov	$SZ*5(%r15),$F
	mov	$SZ*6(%r15),$G
	mov	$SZ*7(%r15),$H
	vmovdqu	0x00-0x80($inp),$roundkey
___
if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));

$code.=<<___;
	jmp	.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3

	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]

	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	lea	-16*$SZ(%r13),%r13
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	mov	$B,$a3
	vmovdqa	$t2,0x00(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x20(%rsp)
	mov	$F,$a4
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	jmp	.Lavx2_00_47

.align	16
.Lavx2_00_47:
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
___
sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}
for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
}
	&vmovq	("%r13",$offload);		# borrow $a0
	&vpextrq	("%r15",$offload,1);	# borrow $a2
	&vpand	($temp,$temp,$mask14);
	&vpor	($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r13)",$iv);	# write output
	&lea	("%r13","16(%r13)");		# inp++

	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

	&vmovdqu	($inout,"(%r13)");
	&vpinsrq	($offload,$offload,"%r13",0);
for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
}
$code.=<<___;
	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
	vmovq	$offload,%r13			# $_inp, borrow $a0
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	add	$a1,$A
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	vpand	$mask14,$temp,$temp
	vpor	$temp,$iv,$iv
	vmovdqu	$iv,(%r12,%r13)			# write output
	lea	16(%r13),%r13

	add	$SZ*0(%r15),$A
	add	$SZ*1(%r15),$B
	add	$SZ*2(%r15),$C
	add	$SZ*3(%r15),$D
	add	$SZ*4(%r15),$E
	add	$SZ*5(%r15),$F
	add	$SZ*6(%r15),$G
	add	$SZ*7(%r15),$H

	mov	$A,$SZ*0(%r15)
	mov	$B,$SZ*1(%r15)
	mov	$C,$SZ*2(%r15)
	mov	$D,$SZ*3(%r15)
	mov	$E,$SZ*4(%r15)
	mov	$F,$SZ*5(%r15)
	mov	$G,$SZ*6(%r15)
	mov	$H,$SZ*7(%r15)
	cmp	`$PUSH8+2*8`($Tbl),%r13	# $_end
	je	.Ldone_avx2

	xor	$a1,$a1
	mov	$B,$a3
	xor	$C,$a3			# magic
	mov	$F,$a4
	jmp	.Lower_avx2
.align	16
.Lower_avx2:
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
___
for ($i=0; $i<16; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
}
$code.=<<___;
	vmovq	$offload,%r13			# borrow $a0
	vpextrq	\$1,$offload,%r15		# borrow $a2
	vpand	$mask14,$temp,$temp
	vpor	$temp,$iv,$iv
	lea	-$PUSH8($Tbl),$Tbl
	vmovdqu	$iv,(%r15,%r13)			# write output
	lea	16(%r13),%r13			# inp++
	cmp	%rsp,$Tbl
	jae	.Lower_avx2

	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	16*$SZ(%r13),%r13
	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
	add	$a1,$A
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp

	add	$SZ*0(%r15),$A
	add	$SZ*1(%r15),$B
	add	$SZ*2(%r15),$C
	add	$SZ*3(%r15),$D
	add	$SZ*4(%r15),$E
	add	$SZ*5(%r15),$F
	add	$SZ*6(%r15),$G
	add	$SZ*7(%r15),$H

	lea	(%rsi,%r13),%r12
	mov	$A,$SZ*0(%r15)
	cmp	$_end,%r13
	mov	$B,$SZ*1(%r15)
	cmove	%rsp,%r12		# next block or stale data
	mov	$C,$SZ*2(%r15)
	mov	$D,$SZ*3(%r15)
	mov	$E,$SZ*4(%r15)
	mov	$F,$SZ*5(%r15)
	mov	$G,$SZ*6(%r15)
	mov	$H,$SZ*7(%r15)

	jbe	.Loop_avx2
	lea	(%rsp),$Tbl

.Ldone_avx2:
	lea	($Tbl),%rsp
	mov	$_ivp,$ivp
	mov	$_rsp,%rsi
.cfi_def_cfa	%rsi,8
	vmovdqu	$iv,($ivp)		# output IV
	vzeroall
___
}
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	${func}_avx2,.-${func}_avx2
___
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my ($rounds,$Tbl)=("%r11d","%rbx");

my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");
my $r=0;
my $sn=0;

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));
my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
	movups		`16*$n`($in0),$in		# load input
	xorps		$rndkey0,$in
___
      $code.=<<___ if ($n);
	movups		$iv,`16*($n-1)`($out,$in0)	# write output
___
      $code.=<<___;
	xorps		$in,$iv
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
	cmp		\$11,$rounds
	jb		.Laesenclast$sn
	movups		`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
	je		.Laesenclast$sn
	movups		`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv
.Laesenclast$sn:
	aesenclast	$rndkey[0],$iv
	movups		16-112($key),$rndkey[1]		# forward reference
___
    } else {
      $code.=<<___;
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
___
    }
    $r++;	unshift(@rndkey,pop(@rndkey));
};
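# So $aesenc() doles out exactly one AES round per call: $n=$r/10 is the
# 16-byte block index and $k=$r%10 the round within it, with the $k==9
# case stretching itself to 14 rounds at run time for 256-bit keys (the
# 192-bit column is the one dashed out in the performance table above).
# The SHA256 round count therefore remains the only "clock" the stitch
# runs on.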
$code.=<<___;
.type	${func}_shaext,\@function,6
.align	32
${func}_shaext:
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
___
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	lea	K256+0x80(%rip),$Tbl
	movdqu	($ctx),$ABEF		# DCBA
	movdqu	16($ctx),$CDGH		# HGFE
	movdqa	0x200-0x80($Tbl),$TMP	# byte swap mask

	mov	240($key),$rounds

	movups	($key),$rndkey0		# $key[0]
	movups	($ivp),$iv		# load IV
	movups	16($key),$rndkey[0]	# forward reference
	lea	112($key),$key		# size optimization

	pshufd	\$0x1b,$ABEF,$Wi	# ABCD
	pshufd	\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd	\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa	$TMP,$BSWAP		# offload
	palignr	\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH	# CDGH
	jmp	.Loop_shaext
___
$code.=<<___;
.align	16
.Loop_shaext:
	movdqu	($inp),@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]

	movdqa	0*32-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	pshufb	$BSWAP,@MSG[1]
	movdqa	$CDGH,$CDGH_SAVE	# offload
	movdqa	$ABEF,$ABEF_SAVE	# offload
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 0-3
	pshufd	\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa	1*32-0x80($Tbl),$Wi
	paddd	@MSG[1],$Wi
	pshufb	$BSWAP,@MSG[2]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 4-7
	pshufd	\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa	2*32-0x80($Tbl),$Wi
	paddd	@MSG[2],$Wi
	pshufb	$BSWAP,@MSG[3]
	sha256msg1	@MSG[1],@MSG[0]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 8-11
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[3],$TMP
	palignr	\$4,@MSG[2],$TMP
	paddd	$TMP,@MSG[0]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa	3*32-0x80($Tbl),$Wi
	paddd	@MSG[3],$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 12-15
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[0],$TMP
	palignr	\$4,@MSG[3],$TMP
	paddd	$TMP,@MSG[1]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF
___
for($i=4;$i<16-3;$i++) {
	&$aesenc()	if (($r%10)==0);
$code.=<<___;
	movdqa	$i*32-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 16-19...
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[1],$TMP
	palignr	\$4,@MSG[0],$TMP
	paddd	$TMP,@MSG[2]
___
	&$aesenc()	if ($r==19);
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF
___
	push(@MSG,shift(@MSG));
}
$code.=<<___;
	movdqa	13*32-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 52-55
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[1],$TMP
	palignr	\$4,@MSG[0],$TMP
	paddd	$TMP,@MSG[2]
___
	&$aesenc();
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa	14*32-0x80($Tbl),$Wi
	paddd	@MSG[1],$Wi
	sha256msg2	@MSG[1],@MSG[2]
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 56-59
	pshufd	\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF

	movdqa	15*32-0x80($Tbl),$Wi
	paddd	@MSG[2],$Wi
___
	&$aesenc();
	&$aesenc();
$code.=<<___;
	sha256rnds2	$ABEF,$CDGH	# 60-63
	pshufd	\$0x0e,$Wi,$Wi
___
	&$aesenc();
$code.=<<___;
	sha256rnds2	$CDGH,$ABEF
	#pxor	$CDGH,$rndkey0		# black magic
___
	while ($r<40)	{ &$aesenc(); }	# remaining aesenc's
$code.=<<___;
	#xorps	$CDGH,$rndkey0		# black magic
	paddd	$CDGH_SAVE,$CDGH
	paddd	$ABEF_SAVE,$ABEF

	dec	$len
	movups	$iv,48($out,$in0)	# write output
	lea	64($in0),$in0
	lea	64($inp),$inp
	jnz	.Loop_shaext

	pshufd	\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd	\$0x1b,$ABEF,$TMP	# FEBA
	pshufd	\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF	# DCBA
	palignr	\$8,$TMP,$CDGH		# HGFE

	movups	$iv,($ivp)		# write IV
	movdqu	$ABEF,($ctx)
	movdqu	$CDGH,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	0*16(%rsp),%xmm6
	movaps	1*16(%rsp),%xmm7
	movaps	2*16(%rsp),%xmm8
	movaps	3*16(%rsp),%xmm9
	movaps	4*16(%rsp),%xmm10
	movaps	5*16(%rsp),%xmm11
	movaps	6*16(%rsp),%xmm12
	movaps	7*16(%rsp),%xmm13
	movaps	8*16(%rsp),%xmm14
	movaps	9*16(%rsp),%xmm15
	lea	8+10*16(%rsp),%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.size	${func}_shaext,.-${func}_shaext
___
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64 && $avx) {
my ($rec,$frame,$context,$disp)=("%rcx","%rdx","%r8","%r9");

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lnot_in_shaext

	lea	(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	lea	168(%rax),%rax		# adjust stack pointer
	jmp	.Lin_prologue
.Lnot_in_shaext:
___
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2

	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi
	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop

	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]

.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
___
}
####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}

{
  my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

  sub sha256op38 {
    my $instr = shift;

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
  }
}
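# e.g. "sha256rnds2 %xmm0,%xmm2" comes out as ".byte 0x0f,0x38,0xcb,0xd0"
# (no REX prefix needed, ModR/M 0xd0 = mod 11, reg 2, r/m 0), which keeps
# the module building with assemblers that predate the SHA extension.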
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
print $code;

close STDOUT or die "error closing STDOUT: $!";