3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
13 # in http://download.intel.com/design/intarch/papers/323686.pdf, is
14 # that since AESNI-CBC encrypt exhibits *very* low instruction-level
15 # parallelism, interleaving it with another algorithm makes it possible to
16 # utilize processor resources better and achieve better performance.
17 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
18 # AESNI code is woven into them. As SHA256 dominates execution time,
19 # stitch performance does not depend on AES key length. Below are
20 # performance numbers in cycles per processed byte, less is better,
21 # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
24 # AES-128/-192/-256+SHA256 this(**)gain
25 # Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43%
26 # Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50%
27 # Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59%
28 # Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40%
29 # Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58%
31 # (*) there are XOP, AVX1 and AVX2 code paths, meaning that
32 # Westmere is omitted from the loop; this is because the gain was
33 # not estimated to be high enough to justify the effort;
34 # (**) these are EVP-free results, results obtained with 'speed
35 # -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
# If $flavour contains a dot it is taken to be the output file name:
# move it to $output and clear $flavour.
39 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# $win64 is 1 when the flavour names nasm, masm or mingw64, or when the
# output file name ends in ".asm"; otherwise it stays 0.
41 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# $dir receives the directory part of this script's path ($0), including
# the trailing '/' or '\' separator.
43 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# Look for the x86_64-xlate.pl translator first next to this script and
# then in ../../perlasm/ relative to it; give up if neither file exists.
44 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
45 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
46 die "can't locate x86_64-xlate.pl";
48 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
49 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
50 $avx = ($1>=2.19) + ($1>=2.22);
53 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
54 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
55 $avx = ($1>=2.09) + ($1>=2.10);
58 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
59 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
60 $avx = ($1>=10) + ($1>=12);
63 if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
64 $avx = ($2>=3.0) + ($2>3.0);
# $shaext gates every SHA-extension section of this script (the
# "if ($shaext)" heredocs). By default it simply follows $avx.
67 $shaext=$avx; ### set to zero if compiling for 1.0.1
# When $shaext has been forced to zero while $avx is non-zero, cap $avx
# at 1, which disables the sections guarded by "$avx>1".
68 $avx=1 if (!$shaext && $avx);
# Open OUT as a pipe into the x86_64-xlate.pl translator, run with the
# same perl interpreter that is executing this script ($^X) and given
# $flavour and $output as its arguments.
# The translator path is quoted so that a directory name containing
# spaces does not split the command line, and a failure to start the
# pipe is reported instead of being silently ignored.
70 open OUT,"| \"$^X\" \"$xlate\" $flavour $output" or die "can't call $xlate: $!";
73 $func="aesni_cbc_sha256_enc";
# Eight 32-bit registers for the SHA256 working variables A..H. @ROT
# holds the same list; the round code elsewhere in this file picks
# ($a..$h) from @ROT and rotates the array after each round.
76 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
77 "%r8d","%r9d","%r10d","%r11d");
# Scratch registers used by the round code ($a3 is %esi).
78 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
85 ########################################################################
86 # void aesni_cbc_sha256_enc(const void *inp,
# Registers holding the function arguments. $in0 (%r10) is not passed in
# a register: the function bodies load it from the stack as the 7th
# parameter.
93 ($inp, $out, $len, $key, $ivp, $ctx, $in0) =
94 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
# Stack-frame slots, as operand strings relative to %rsp: eight 8-byte
# slots placed right above the first 16*$SZ bytes of the frame, which
# the XOP/AVX round code reads as X[i]+K[i].
# Slot 7 ($_rsp) is the value the unwind handler pulls to recover the
# caller's stack pointer.
98 $_inp="16*$SZ+0*8(%rsp)";
99 $_out="16*$SZ+1*8(%rsp)";
100 $_end="16*$SZ+2*8(%rsp)";
101 $_key="16*$SZ+3*8(%rsp)";
102 $_ivp="16*$SZ+4*8(%rsp)";
103 $_ctx="16*$SZ+5*8(%rsp)";
104 $_in0="16*$SZ+6*8(%rsp)";
105 $_rsp="16*$SZ+7*8(%rsp)";
111 .extern OPENSSL_ia32cap_P
113 .type $func,\@abi-omnipotent
119 lea OPENSSL_ia32cap_P(%rip),%r11
121 cmp \$0,`$win64?"%rcx":"%rdi"`
126 $code.=<<___ if ($shaext);
127 bt \$61,%r10 # check for SHA
134 test \$`1<<11`,%r10d # check for XOP
137 $code.=<<___ if ($avx>1);
138 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
139 cmp \$`1<<8|1<<5|1<<3`,%r11d
143 and \$`1<<30`,%eax # mask "Intel CPU" bit
144 and \$`1<<28|1<<9`,%r10d # mask AVX+SSSE3 bits
146 cmp \$`1<<28|1<<9|1<<30`,%r10d
153 cmp \$0,`$win64?"%rcx":"%rdi"`
161 .type $TABLE,\@object
163 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
164 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
165 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
166 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
167 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
168 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
169 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
170 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
171 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
172 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
173 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
174 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
175 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
176 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
177 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
178 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
179 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
180 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
181 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
182 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
183 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
184 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
185 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
186 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
187 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
188 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
189 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
190 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
191 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
192 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
193 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
194 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
196 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
197 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
198 .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1
199 .long 0,0,0,0, 0,0,0,0
200 .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
204 ######################################################################
# %xmm8..%xmm15 are reserved for the AES-CBC side of the stitch:
#   $iv       - CBC chaining value / ciphertext block being assembled
#   $inout    - block being encrypted
#   $roundkey - current AES round key
#   $temp     - result of a candidate final round (vaesenclast)
#   $mask10/$mask12/$mask14 - masks loaded from a table indexed by the
#               round count; they are ANDed with the vaesenclast results
#               taken after 10, 12 and 14 rounds
#   $offload  - spare register; the AVX2 path keeps pointer values in it
208 ($iv,$inout,$roundkey,$temp,
209 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
213 ## &vmovdqu ($roundkey,"0x00-0x80($inp)");'
214 ## &vmovdqu ($inout,($inp));
215 ## &mov ($_inp,$inp);
217 '&vpxor ($inout,$inout,$roundkey);'.
218 ' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
220 '&vpxor ($inout,$inout,$iv);',
222 '&vaesenc ($inout,$inout,$roundkey);'.
223 ' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
225 '&vaesenc ($inout,$inout,$roundkey);'.
226 ' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
228 '&vaesenc ($inout,$inout,$roundkey);'.
229 ' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
231 '&vaesenc ($inout,$inout,$roundkey);'.
232 ' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
234 '&vaesenc ($inout,$inout,$roundkey);'.
235 ' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
237 '&vaesenc ($inout,$inout,$roundkey);'.
238 ' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
240 '&vaesenc ($inout,$inout,$roundkey);'.
241 ' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
243 '&vaesenc ($inout,$inout,$roundkey);'.
244 ' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
246 '&vaesenc ($inout,$inout,$roundkey);'.
247 ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
249 '&vaesenclast ($temp,$inout,$roundkey);'.
250 ' &vaesenc ($inout,$inout,$roundkey);'.
251 ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
253 '&vpand ($iv,$temp,$mask10);'.
254 ' &vaesenc ($inout,$inout,$roundkey);'.
255 ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
257 '&vaesenclast ($temp,$inout,$roundkey);'.
258 ' &vaesenc ($inout,$inout,$roundkey);'.
259 ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
261 '&vpand ($temp,$temp,$mask12);'.
262 ' &vaesenc ($inout,$inout,$roundkey);'.
263 '&vmovdqu ($roundkey,"0xe0-0x80($inp)");',
265 '&vpor ($iv,$iv,$temp);'.
266 ' &vaesenclast ($temp,$inout,$roundkey);'.
267 ' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
269 ## &mov ($inp,$_inp);
270 ## &mov ($out,$_out);
271 ## &vpand ($temp,$temp,$mask14);
272 ## &vpor ($iv,$iv,$temp);
273 ## &vmovdqu ($iv,($out,$inp);
274 ## &lea (inp,16($inp));
278 my ($a,$b,$c,$d,$e,$f,$g,$h);
280 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
281 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
283 $arg = "\$$arg" if ($arg*1 eq $arg);
284 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
289 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
291 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
296 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
297 '&xor ($a4,$g)', # f^g
299 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
301 '&and ($a4,$e)', # (f^g)&e
303 @aesni_cbc_block[$aesni_cbc_idx++].
305 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
308 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
309 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
310 '&xor ($a2,$b)', # a^b, b^c in next round
312 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
313 '&add ($h,$a4)', # h+=Ch(e,f,g)
314 '&and ($a3,$a2)', # (b^c)&(a^b)
317 '&add ($h,$a0)', # h+=Sigma1(e)
318 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
320 '&add ($d,$h)', # d+=h
321 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
322 '&add ($h,$a3)', # h+=Maj(a,b,c)
325 '&add ($a1,$h);'. # h+=Sigma0(a)
326 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
331 ######################################################################
335 .type ${func}_xop,\@function,6
339 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
346 mov %rsp,%r11 # copy %rsp
347 sub \$`$framesz+$win64*16*10`,%rsp
348 and \$-64,%rsp # align stack frame
351 sub $inp,$out # re-bias
353 add $inp,$len # end of input
355 #mov $inp,$_inp # saved later
358 #mov $key,$_key # remains resident in $inp register
364 $code.=<<___ if ($win64);
365 movaps %xmm6,`$framesz+16*0`(%rsp)
366 movaps %xmm7,`$framesz+16*1`(%rsp)
367 movaps %xmm8,`$framesz+16*2`(%rsp)
368 movaps %xmm9,`$framesz+16*3`(%rsp)
369 movaps %xmm10,`$framesz+16*4`(%rsp)
370 movaps %xmm11,`$framesz+16*5`(%rsp)
371 movaps %xmm12,`$framesz+16*6`(%rsp)
372 movaps %xmm13,`$framesz+16*7`(%rsp)
373 movaps %xmm14,`$framesz+16*8`(%rsp)
374 movaps %xmm15,`$framesz+16*9`(%rsp)
380 mov $inp,%r12 # borrow $a4
381 lea 0x80($key),$inp # size optimization, reassign
382 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
383 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
384 mov $ctx,%r15 # borrow $a2
385 mov $in0,%rsi # borrow $a3
386 vmovdqu ($ivp),$iv # load IV
398 vmovdqa 0x00(%r13,%r14,8),$mask14
399 vmovdqa 0x10(%r13,%r14,8),$mask12
400 vmovdqa 0x20(%r13,%r14,8),$mask10
401 vmovdqu 0x00-0x80($inp),$roundkey
404 if ($SZ==4) { # SHA256
405 my @X = map("%xmm$_",(0..3));
406 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
411 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
412 vmovdqu 0x00(%rsi,%r12),@X[0]
413 vmovdqu 0x10(%rsi,%r12),@X[1]
414 vmovdqu 0x20(%rsi,%r12),@X[2]
415 vmovdqu 0x30(%rsi,%r12),@X[3]
416 vpshufb $t3,@X[0],@X[0]
417 lea $TABLE(%rip),$Tbl
418 vpshufb $t3,@X[1],@X[1]
419 vpshufb $t3,@X[2],@X[2]
420 vpaddd 0x00($Tbl),@X[0],$t0
421 vpshufb $t3,@X[3],@X[3]
422 vpaddd 0x20($Tbl),@X[1],$t1
423 vpaddd 0x40($Tbl),@X[2],$t2
424 vpaddd 0x60($Tbl),@X[3],$t3
425 vmovdqa $t0,0x00(%rsp)
427 vmovdqa $t1,0x10(%rsp)
429 vmovdqa $t2,0x20(%rsp)
431 vmovdqa $t3,0x30(%rsp)
437 sub \$-16*2*$SZ,$Tbl # size optimization
438 vmovdqu (%r12),$inout # $a4
441 sub XOP_256_00_47 () {
445 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
447 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
450 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
453 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
456 &vpsrld ($t0,$t0,$sigma0[2]);
459 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
464 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
467 &vpxor ($t0,$t0,$t1);
472 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
475 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
478 &vpsrld ($t2,@X[3],$sigma1[2]);
481 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
484 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
487 &vpxor ($t3,$t3,$t2);
492 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
497 &vpsrldq ($t3,$t3,8);
502 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
507 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
510 &vpsrld ($t2,@X[0],$sigma1[2]);
513 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
516 &vpxor ($t3,$t3,$t2);
521 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
526 &vpslldq ($t3,$t3,8); # 22 instructions
531 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
536 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
537 foreach (@insns) { eval; } # remaining instructions
538 &vmovdqa (16*$j."(%rsp)",$t2);
542 for ($i=0,$j=0; $j<4; $j++) {
543 &XOP_256_00_47($j,\&body_00_15,@X);
544 push(@X,shift(@X)); # rotate(@X)
546 &mov ("%r12",$_inp); # borrow $a4
547 &vpand ($temp,$temp,$mask14);
548 &mov ("%r15",$_out); # borrow $a2
549 &vpor ($iv,$iv,$temp);
550 &vmovdqu ("(%r15,%r12)",$iv); # write output
551 &lea ("%r12","16(%r12)"); # inp++
553 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
554 &jne (".Lxop_00_47");
556 &vmovdqu ($inout,"(%r12)");
560 for ($i=0; $i<16; ) {
561 foreach(body_00_15()) { eval; }
565 mov $_inp,%r12 # borrow $a4
566 mov $_out,%r13 # borrow $a0
567 mov $_ctx,%r15 # borrow $a2
568 mov $_in0,%rsi # borrow $a3
570 vpand $mask14,$temp,$temp
573 vmovdqu $iv,(%r13,%r12) # write output
574 lea 16(%r12),%r12 # inp++
600 vmovdqu $iv,($ivp) # output IV
603 $code.=<<___ if ($win64);
604 movaps `$framesz+16*0`(%rsp),%xmm6
605 movaps `$framesz+16*1`(%rsp),%xmm7
606 movaps `$framesz+16*2`(%rsp),%xmm8
607 movaps `$framesz+16*3`(%rsp),%xmm9
608 movaps `$framesz+16*4`(%rsp),%xmm10
609 movaps `$framesz+16*5`(%rsp),%xmm11
610 movaps `$framesz+16*6`(%rsp),%xmm12
611 movaps `$framesz+16*7`(%rsp),%xmm13
612 movaps `$framesz+16*8`(%rsp),%xmm14
613 movaps `$framesz+16*9`(%rsp),%xmm15
625 .size ${func}_xop,.-${func}_xop
627 ######################################################################
# Override ror for the code generated from this point on: every
# &ror(reg,count) becomes &shrd(reg,reg,count), i.e. a rotate right
# expressed as a double-precision shift of the register with itself.
# `local` makes the override temporary; it ends when the enclosing
# scope is left.
630 local *ror = sub { &shrd(@_[0],@_) };
633 .type ${func}_avx,\@function,6
637 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
644 mov %rsp,%r11 # copy %rsp
645 sub \$`$framesz+$win64*16*10`,%rsp
646 and \$-64,%rsp # align stack frame
649 sub $inp,$out # re-bias
651 add $inp,$len # end of input
653 #mov $inp,$_inp # saved later
656 #mov $key,$_key # remains resident in $inp register
662 $code.=<<___ if ($win64);
663 movaps %xmm6,`$framesz+16*0`(%rsp)
664 movaps %xmm7,`$framesz+16*1`(%rsp)
665 movaps %xmm8,`$framesz+16*2`(%rsp)
666 movaps %xmm9,`$framesz+16*3`(%rsp)
667 movaps %xmm10,`$framesz+16*4`(%rsp)
668 movaps %xmm11,`$framesz+16*5`(%rsp)
669 movaps %xmm12,`$framesz+16*6`(%rsp)
670 movaps %xmm13,`$framesz+16*7`(%rsp)
671 movaps %xmm14,`$framesz+16*8`(%rsp)
672 movaps %xmm15,`$framesz+16*9`(%rsp)
678 mov $inp,%r12 # borrow $a4
679 lea 0x80($key),$inp # size optimization, reassign
680 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
681 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
682 mov $ctx,%r15 # borrow $a2
683 mov $in0,%rsi # borrow $a3
684 vmovdqu ($ivp),$iv # load IV
696 vmovdqa 0x00(%r13,%r14,8),$mask14
697 vmovdqa 0x10(%r13,%r14,8),$mask12
698 vmovdqa 0x20(%r13,%r14,8),$mask10
699 vmovdqu 0x00-0x80($inp),$roundkey
701 if ($SZ==4) { # SHA256
702 my @X = map("%xmm$_",(0..3));
703 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
709 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
710 vmovdqu 0x00(%rsi,%r12),@X[0]
711 vmovdqu 0x10(%rsi,%r12),@X[1]
712 vmovdqu 0x20(%rsi,%r12),@X[2]
713 vmovdqu 0x30(%rsi,%r12),@X[3]
714 vpshufb $t3,@X[0],@X[0]
715 lea $TABLE(%rip),$Tbl
716 vpshufb $t3,@X[1],@X[1]
717 vpshufb $t3,@X[2],@X[2]
718 vpaddd 0x00($Tbl),@X[0],$t0
719 vpshufb $t3,@X[3],@X[3]
720 vpaddd 0x20($Tbl),@X[1],$t1
721 vpaddd 0x40($Tbl),@X[2],$t2
722 vpaddd 0x60($Tbl),@X[3],$t3
723 vmovdqa $t0,0x00(%rsp)
725 vmovdqa $t1,0x10(%rsp)
727 vmovdqa $t2,0x20(%rsp)
729 vmovdqa $t3,0x30(%rsp)
735 sub \$-16*2*$SZ,$Tbl # size optimization
736 vmovdqu (%r12),$inout # $a4
739 sub Xupdate_256_AVX () {
741 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
742 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
743 '&vpsrld ($t2,$t0,$sigma0[0]);',
744 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
745 '&vpsrld ($t3,$t0,$sigma0[2])',
746 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
747 '&vpxor ($t0,$t3,$t2)',
748 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
749 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
750 '&vpxor ($t0,$t0,$t1)',
751 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
752 '&vpxor ($t0,$t0,$t2)',
753 '&vpsrld ($t2,$t3,$sigma1[2]);',
754 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
755 '&vpsrlq ($t3,$t3,$sigma1[0]);',
756 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
757 '&vpxor ($t2,$t2,$t3);',
758 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
759 '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
760 '&vpshufd ($t2,$t2,0b10000100)',
761 '&vpsrldq ($t2,$t2,8)',
762 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
763 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
764 '&vpsrld ($t2,$t3,$sigma1[2])',
765 '&vpsrlq ($t3,$t3,$sigma1[0])',
766 '&vpxor ($t2,$t2,$t3);',
767 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
768 '&vpxor ($t2,$t2,$t3)',
769 '&vpshufd ($t2,$t2,0b11101000)',
770 '&vpslldq ($t2,$t2,8)',
771 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
775 sub AVX_256_00_47 () {
779 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
781 foreach (Xupdate_256_AVX()) { # 29 instructions
787 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
788 foreach (@insns) { eval; } # remaining instructions
789 &vmovdqa (16*$j."(%rsp)",$t2);
793 for ($i=0,$j=0; $j<4; $j++) {
794 &AVX_256_00_47($j,\&body_00_15,@X);
795 push(@X,shift(@X)); # rotate(@X)
797 &mov ("%r12",$_inp); # borrow $a4
798 &vpand ($temp,$temp,$mask14);
799 &mov ("%r15",$_out); # borrow $a2
800 &vpor ($iv,$iv,$temp);
801 &vmovdqu ("(%r15,%r12)",$iv); # write output
802 &lea ("%r12","16(%r12)"); # inp++
804 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
805 &jne (".Lavx_00_47");
807 &vmovdqu ($inout,"(%r12)");
811 for ($i=0; $i<16; ) {
812 foreach(body_00_15()) { eval; }
817 mov $_inp,%r12 # borrow $a4
818 mov $_out,%r13 # borrow $a0
819 mov $_ctx,%r15 # borrow $a2
820 mov $_in0,%rsi # borrow $a3
822 vpand $mask14,$temp,$temp
825 vmovdqu $iv,(%r13,%r12) # write output
826 lea 16(%r12),%r12 # inp++
851 vmovdqu $iv,($ivp) # output IV
854 $code.=<<___ if ($win64);
855 movaps `$framesz+16*0`(%rsp),%xmm6
856 movaps `$framesz+16*1`(%rsp),%xmm7
857 movaps `$framesz+16*2`(%rsp),%xmm8
858 movaps `$framesz+16*3`(%rsp),%xmm9
859 movaps `$framesz+16*4`(%rsp),%xmm10
860 movaps `$framesz+16*5`(%rsp),%xmm11
861 movaps `$framesz+16*6`(%rsp),%xmm12
862 movaps `$framesz+16*7`(%rsp),%xmm13
863 movaps `$framesz+16*8`(%rsp),%xmm14
864 movaps `$framesz+16*9`(%rsp),%xmm15
876 .size ${func}_avx,.-${func}_avx
880 ######################################################################
883 my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
888 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
890 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
892 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
893 '&and ($a4,$e)', # f&e
894 '&rorx ($a0,$e,$Sigma1[2])',
895 '&rorx ($a2,$e,$Sigma1[1])',
897 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
898 '&lea ($h,"($h,$a4)")',
899 '&andn ($a4,$e,$g)', # ~e&g
902 '&rorx ($a1,$e,$Sigma1[0])',
903 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
904 '&xor ($a0,$a1)', # Sigma1(e)
907 '&rorx ($a4,$a,$Sigma0[2])',
908 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
909 '&xor ($a2,$b)', # a^b, b^c in next round
910 '&rorx ($a1,$a,$Sigma0[1])',
912 '&rorx ($a0,$a,$Sigma0[0])',
913 '&lea ($d,"($d,$h)")', # d+=h
914 '&and ($a3,$a2)', # (b^c)&(a^b)
915 @aesni_cbc_block[$aesni_cbc_idx++].
918 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
919 '&xor ($a1,$a0)', # Sigma0(a)
920 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
921 '&mov ($a4,$e)', # copy of f in future
923 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
925 # and at the finish one has to $a+=$a1
929 .type ${func}_avx2,\@function,6
933 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
940 mov %rsp,%r11 # copy %rsp
941 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
942 and \$-256*$SZ,%rsp # align stack frame
943 add \$`2*$SZ*($rounds-8)`,%rsp
946 sub $inp,$out # re-bias
948 add $inp,$len # end of input
950 #mov $inp,$_inp # saved later
951 #mov $out,$_out # kept in $offload
953 #mov $key,$_key # remains resident in $inp register
959 $code.=<<___ if ($win64);
960 movaps %xmm6,`$framesz+16*0`(%rsp)
961 movaps %xmm7,`$framesz+16*1`(%rsp)
962 movaps %xmm8,`$framesz+16*2`(%rsp)
963 movaps %xmm9,`$framesz+16*3`(%rsp)
964 movaps %xmm10,`$framesz+16*4`(%rsp)
965 movaps %xmm11,`$framesz+16*5`(%rsp)
966 movaps %xmm12,`$framesz+16*6`(%rsp)
967 movaps %xmm13,`$framesz+16*7`(%rsp)
968 movaps %xmm14,`$framesz+16*8`(%rsp)
969 movaps %xmm15,`$framesz+16*9`(%rsp)
975 mov $inp,%r13 # borrow $a0
976 vpinsrq \$1,$out,$offload,$offload
977 lea 0x80($key),$inp # size optimization, reassign
978 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
979 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
980 mov $ctx,%r15 # borrow $a2
981 mov $in0,%rsi # borrow $a3
982 vmovdqu ($ivp),$iv # load IV
985 vmovdqa 0x00(%r12,%r14,8),$mask14
986 vmovdqa 0x10(%r12,%r14,8),$mask12
987 vmovdqa 0x20(%r12,%r14,8),$mask10
989 sub \$-16*$SZ,%r13 # inp++, size optimization
991 lea (%rsi,%r13),%r12 # borrow $a0
993 cmp $len,%r13 # $_end
995 cmove %rsp,%r12 # next block or random data
1001 vmovdqu 0x00-0x80($inp),$roundkey
1003 if ($SZ==4) { # SHA256
1004 my @X = map("%ymm$_",(0..3));
1005 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1011 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1012 vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1013 vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1014 vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1015 vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1017 vinserti128 \$1,(%r12),@X[0],@X[0]
1018 vinserti128 \$1,16(%r12),@X[1],@X[1]
1019 vpshufb $t3,@X[0],@X[0]
1020 vinserti128 \$1,32(%r12),@X[2],@X[2]
1021 vpshufb $t3,@X[1],@X[1]
1022 vinserti128 \$1,48(%r12),@X[3],@X[3]
1024 lea $TABLE(%rip),$Tbl
1025 vpshufb $t3,@X[2],@X[2]
1026 lea -16*$SZ(%r13),%r13
1027 vpaddd 0x00($Tbl),@X[0],$t0
1028 vpshufb $t3,@X[3],@X[3]
1029 vpaddd 0x20($Tbl),@X[1],$t1
1030 vpaddd 0x40($Tbl),@X[2],$t2
1031 vpaddd 0x60($Tbl),@X[3],$t3
1032 vmovdqa $t0,0x00(%rsp)
1034 vmovdqa $t1,0x20(%rsp)
1035 lea -$PUSH8(%rsp),%rsp
1037 vmovdqa $t2,0x00(%rsp)
1039 vmovdqa $t3,0x20(%rsp)
1041 sub \$-16*2*$SZ,$Tbl # size optimization
1046 vmovdqu (%r13),$inout
1047 vpinsrq \$0,%r13,$offload,$offload
1050 sub AVX2_256_00_47 () {
1054 my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
1055 my $base = "+2*$PUSH8(%rsp)";
1057 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
1058 foreach (Xupdate_256_AVX()) { # 29 instructions
1060 eval(shift(@insns));
1061 eval(shift(@insns));
1062 eval(shift(@insns));
1064 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1065 foreach (@insns) { eval; } # remaining instructions
1066 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
1069 for ($i=0,$j=0; $j<4; $j++) {
1070 &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1071 push(@X,shift(@X)); # rotate(@X)
1073 &vmovq ("%r13",$offload); # borrow $a0
1074 &vpextrq ("%r15",$offload,1); # borrow $a2
1075 &vpand ($temp,$temp,$mask14);
1076 &vpor ($iv,$iv,$temp);
1077 &vmovdqu ("(%r15,%r13)",$iv); # write output
1078 &lea ("%r13","16(%r13)"); # inp++
1080 &lea ($Tbl,16*2*$SZ."($Tbl)");
1081 &cmpb (($SZ-1)."($Tbl)",0);
1082 &jne (".Lavx2_00_47");
1084 &vmovdqu ($inout,"(%r13)");
1085 &vpinsrq ($offload,$offload,"%r13",0);
1088 for ($i=0; $i<16; ) {
1089 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1090 foreach(bodyx_00_15()) { eval; }
1094 vpextrq \$1,$offload,%r12 # $_out, borrow $a4
1095 vmovq $offload,%r13 # $_inp, borrow $a0
1096 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1098 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
1100 vpand $mask14,$temp,$temp
1102 vmovdqu $iv,(%r12,%r13) # write output
1123 cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
1133 vmovdqu (%r13),$inout
1134 vpinsrq \$0,%r13,$offload,$offload
1137 for ($i=0; $i<16; ) {
1138 my $base="+16($Tbl)";
1139 foreach(bodyx_00_15()) { eval; }
1140 &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8);
1143 vmovq $offload,%r13 # borrow $a0
1144 vpextrq \$1,$offload,%r15 # borrow $a2
1145 vpand $mask14,$temp,$temp
1147 lea -$PUSH8($Tbl),$Tbl
1148 vmovdqu $iv,(%r15,%r13) # write output
1149 lea 16(%r13),%r13 # inp++
1153 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1154 lea 16*$SZ(%r13),%r13
1155 mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
1157 lea `2*$SZ*($rounds-8)`(%rsp),%rsp
1166 lea (%rsi,%r13),%r12
1172 cmove %rsp,%r12 # next block or stale data
1188 vmovdqu $iv,($ivp) # output IV
1191 $code.=<<___ if ($win64);
1192 movaps `$framesz+16*0`(%rsp),%xmm6
1193 movaps `$framesz+16*1`(%rsp),%xmm7
1194 movaps `$framesz+16*2`(%rsp),%xmm8
1195 movaps `$framesz+16*3`(%rsp),%xmm9
1196 movaps `$framesz+16*4`(%rsp),%xmm10
1197 movaps `$framesz+16*5`(%rsp),%xmm11
1198 movaps `$framesz+16*6`(%rsp),%xmm12
1199 movaps `$framesz+16*7`(%rsp),%xmm13
1200 movaps `$framesz+16*8`(%rsp),%xmm14
1201 movaps `$framesz+16*9`(%rsp),%xmm15
1213 .size ${func}_avx2,.-${func}_avx2
# Register assignment for the SHA-extension code path. Compared with the
# file-level assignment, $in0 and $inp swap roles here: $in0 is %rdi and
# $inp is %r10, which this path loads from the stack as the 7th argument.
1218 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
# $rounds is loaded from 240($key); $Tbl is pointed at K256+0x80.
1220 my ($rounds,$Tbl)=("%r11d","%rbx");
# AES state: chaining value, input block and round key 0 ...
1222 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
# ... plus two round-key registers that the AES helper swaps after each
# step.
1223 my @rndkey=("%xmm4","%xmm5");
# SHA256 state and temporaries in %xmm0-3 and %xmm7-9.
1227 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
# Four message-schedule registers, rotated as the rounds proceed.
1228 my @MSG=map("%xmm$_",(10..13));
1232 my ($n,$k)=($r/10,$r%10);
1235 movups `16*$n`($in0),$in # load input
1238 $code.=<<___ if ($n);
1239 movups $iv,`16*($n-1)`($out,$in0) # write output
1243 movups `32+16*$k-112`($key),$rndkey[1]
1244 aesenc $rndkey[0],$iv
1251 movups `32+16*($k+0)-112`($key),$rndkey[1]
1252 aesenc $rndkey[0],$iv
1253 movups `32+16*($k+1)-112`($key),$rndkey[0]
1254 aesenc $rndkey[1],$iv
1256 movups `32+16*($k+2)-112`($key),$rndkey[1]
1257 aesenc $rndkey[0],$iv
1258 movups `32+16*($k+3)-112`($key),$rndkey[0]
1259 aesenc $rndkey[1],$iv
1261 aesenclast $rndkey[0],$iv
1262 movups 16-112($key),$rndkey[1] # forward reference
1267 movups `32+16*$k-112`($key),$rndkey[1]
1268 aesenc $rndkey[0],$iv
1271 $r++; unshift(@rndkey,pop(@rndkey));
1278 .type ${func}_shaext,\@function,6
1281 mov `($win64?56:8)`(%rsp),$inp # load 7th argument
1283 $code.=<<___ if ($win64);
1284 lea `-8-10*16`(%rsp),%rsp
1285 movaps %xmm6,-8-10*16(%rax)
1286 movaps %xmm7,-8-9*16(%rax)
1287 movaps %xmm8,-8-8*16(%rax)
1288 movaps %xmm9,-8-7*16(%rax)
1289 movaps %xmm10,-8-6*16(%rax)
1290 movaps %xmm11,-8-5*16(%rax)
1291 movaps %xmm12,-8-4*16(%rax)
1292 movaps %xmm13,-8-3*16(%rax)
1293 movaps %xmm14,-8-2*16(%rax)
1294 movaps %xmm15,-8-1*16(%rax)
1298 lea K256+0x80(%rip),$Tbl
1299 movdqu ($ctx),$ABEF # DCBA
1300 movdqu 16($ctx),$CDGH # HGFE
1301 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
1303 mov 240($key),$rounds
1305 movups ($key),$rndkey0 # $key[0]
1306 movups 16($key),$rndkey[0] # forward reference
1307 lea 112($key),$key # size optimization
1309 pshufd \$0x1b,$ABEF,$Wi # ABCD
1310 pshufd \$0xb1,$ABEF,$ABEF # CDAB
1311 pshufd \$0x1b,$CDGH,$CDGH # EFGH
1312 movdqa $TMP,$BSWAP # offload
1313 palignr \$8,$CDGH,$ABEF # ABEF
1314 punpcklqdq $Wi,$CDGH # CDGH
1320 movdqu ($inp),@MSG[0]
1321 movdqu 0x10($inp),@MSG[1]
1322 movdqu 0x20($inp),@MSG[2]
1324 movdqu 0x30($inp),@MSG[3]
1326 movdqa 0*32-0x80($Tbl),$Wi
1329 movdqa $CDGH,$CDGH_SAVE # offload
1330 movdqa $ABEF,$ABEF_SAVE # offload
1334 sha256rnds2 $ABEF,$CDGH # 0-3
1335 pshufd \$0x0e,$Wi,$Wi
1339 sha256rnds2 $CDGH,$ABEF
1341 movdqa 1*32-0x80($Tbl),$Wi
1348 sha256rnds2 $ABEF,$CDGH # 4-7
1349 pshufd \$0x0e,$Wi,$Wi
1353 sha256rnds2 $CDGH,$ABEF
1355 movdqa 2*32-0x80($Tbl),$Wi
1358 sha256msg1 @MSG[1],@MSG[0]
1362 sha256rnds2 $ABEF,$CDGH # 8-11
1363 pshufd \$0x0e,$Wi,$Wi
1365 palignr \$4,@MSG[2],$TMP
1370 sha256rnds2 $CDGH,$ABEF
1372 movdqa 3*32-0x80($Tbl),$Wi
1374 sha256msg2 @MSG[3],@MSG[0]
1375 sha256msg1 @MSG[2],@MSG[1]
1379 sha256rnds2 $ABEF,$CDGH # 12-15
1380 pshufd \$0x0e,$Wi,$Wi
1385 palignr \$4,@MSG[3],$TMP
1387 sha256rnds2 $CDGH,$ABEF
1389 for($i=4;$i<16-3;$i++) {
1390 &$aesenc() if (($r%10)==0);
1392 movdqa $i*32-0x80($Tbl),$Wi
1394 sha256msg2 @MSG[0],@MSG[1]
1395 sha256msg1 @MSG[3],@MSG[2]
1399 sha256rnds2 $ABEF,$CDGH # 16-19...
1400 pshufd \$0x0e,$Wi,$Wi
1402 palignr \$4,@MSG[0],$TMP
1406 &$aesenc() if ($r==19);
1408 sha256rnds2 $CDGH,$ABEF
1410 push(@MSG,shift(@MSG));
1413 movdqa 13*32-0x80($Tbl),$Wi
1415 sha256msg2 @MSG[0],@MSG[1]
1416 sha256msg1 @MSG[3],@MSG[2]
1420 sha256rnds2 $ABEF,$CDGH # 52-55
1421 pshufd \$0x0e,$Wi,$Wi
1423 palignr \$4,@MSG[0],$TMP
1429 sha256rnds2 $CDGH,$ABEF
1431 movdqa 14*32-0x80($Tbl),$Wi
1433 sha256msg2 @MSG[1],@MSG[2]
1438 sha256rnds2 $ABEF,$CDGH # 56-59
1439 pshufd \$0x0e,$Wi,$Wi
1443 sha256rnds2 $CDGH,$ABEF
1445 movdqa 15*32-0x80($Tbl),$Wi
1451 sha256rnds2 $ABEF,$CDGH # 60-63
1452 pshufd \$0x0e,$Wi,$Wi
1456 sha256rnds2 $CDGH,$ABEF
1457 #pxor $CDGH,$rndkey0 # black magic
1459 while ($r<40) { &$aesenc(); } # remaining aesenc's
1461 #xorps $CDGH,$rndkey0 # black magic
1462 paddd $CDGH_SAVE,$CDGH
1463 paddd $ABEF_SAVE,$ABEF
1466 movups $iv,48($out,$in0) # write output
1470 pshufd \$0xb1,$CDGH,$CDGH # DCHG
1471 pshufd \$0x1b,$ABEF,$TMP # FEBA
1472 pshufd \$0xb1,$ABEF,$ABEF # BAFE
1473 punpckhqdq $CDGH,$ABEF # DCBA
1474 palignr \$8,$TMP,$CDGH # HGFE
1476 movups $iv,($ivp) # write IV
1478 movdqu $CDGH,16($ctx)
1480 $code.=<<___ if ($win64);
1481 movaps 0*16(%rsp),%xmm6
1482 movaps 1*16(%rsp),%xmm7
1483 movaps 2*16(%rsp),%xmm8
1484 movaps 3*16(%rsp),%xmm9
1485 movaps 4*16(%rsp),%xmm10
1486 movaps 5*16(%rsp),%xmm11
1487 movaps 6*16(%rsp),%xmm12
1488 movaps 7*16(%rsp),%xmm13
1489 movaps 8*16(%rsp),%xmm14
1490 movaps 9*16(%rsp),%xmm15
1491 lea 8+10*16(%rsp),%rsp
1496 .size ${func}_shaext,.-${func}_shaext
1501 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1502 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1503 if ($win64 && $avx) {
1510 .extern __imp_RtlVirtualUnwind
1511 .type se_handler,\@abi-omnipotent
1525 mov 120($context),%rax # pull context->Rax
1526 mov 248($context),%rbx # pull context->Rip
1528 mov 8($disp),%rsi # disp->ImageBase
1529 mov 56($disp),%r11 # disp->HanderlData
1531 mov 0(%r11),%r10d # HandlerData[0]
1532 lea (%rsi,%r10),%r10 # prologue label
1533 cmp %r10,%rbx # context->Rip<prologue label
1536 mov 152($context),%rax # pull context->Rsp
1538 mov 4(%r11),%r10d # HandlerData[1]
1539 lea (%rsi,%r10),%r10 # epilogue label
1540 cmp %r10,%rbx # context->Rip>=epilogue label
1543 $code.=<<___ if ($shaext);
1544 lea aesni_cbc_sha256_enc_shaext(%rip),%r10
1549 lea 512($context),%rdi # &context.Xmm6
1551 .long 0xa548f3fc # cld; rep movsq
1552 lea 168(%rax),%rax # adjust stack pointer
1556 $code.=<<___ if ($avx>1);
1557 lea .Lavx2_shortcut(%rip),%r10
1558 cmp %r10,%rbx # context->Rip<avx2_shortcut
1562 add \$`2*$SZ*($rounds-8)`,%rax
1566 mov %rax,%rsi # put aside Rsp
1567 mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
1576 mov %rbx,144($context) # restore context->Rbx
1577 mov %rbp,160($context) # restore context->Rbp
1578 mov %r12,216($context) # restore context->R12
1579 mov %r13,224($context) # restore context->R13
1580 mov %r14,232($context) # restore context->R14
1581 mov %r15,240($context) # restore context->R15
1583 lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area
1584 lea 512($context),%rdi # &context.Xmm6
1586 .long 0xa548f3fc # cld; rep movsq
1591 mov %rax,152($context) # restore context->Rsp
1592 mov %rsi,168($context) # restore context->Rsi
1593 mov %rdi,176($context) # restore context->Rdi
1595 mov 40($disp),%rdi # disp->ContextRecord
1596 mov $context,%rsi # context
1597 mov \$154,%ecx # sizeof(CONTEXT)
1598 .long 0xa548f3fc # cld; rep movsq
1601 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1602 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1603 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1604 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1605 mov 40(%rsi),%r10 # disp->ContextRecord
1606 lea 56(%rsi),%r11 # &disp->HandlerData
1607 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1608 mov %r10,32(%rsp) # arg5
1609 mov %r11,40(%rsp) # arg6
1610 mov %r12,48(%rsp) # arg7
1611 mov %rcx,56(%rsp) # arg8, (NULL)
1612 call *__imp_RtlVirtualUnwind(%rip)
1614 mov \$1,%eax # ExceptionContinueSearch
1626 .size se_handler,.-se_handler
1629 .rva .LSEH_begin_${func}_xop
1630 .rva .LSEH_end_${func}_xop
1631 .rva .LSEH_info_${func}_xop
1633 .rva .LSEH_begin_${func}_avx
1634 .rva .LSEH_end_${func}_avx
1635 .rva .LSEH_info_${func}_avx
1637 $code.=<<___ if ($avx>1);
1638 .rva .LSEH_begin_${func}_avx2
1639 .rva .LSEH_end_${func}_avx2
1640 .rva .LSEH_info_${func}_avx2
1642 $code.=<<___ if ($shaext);
1643 .rva .LSEH_begin_${func}_shaext
1644 .rva .LSEH_end_${func}_shaext
1645 .rva .LSEH_info_${func}_shaext
1650 .LSEH_info_${func}_xop:
1653 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
1655 .LSEH_info_${func}_avx:
1658 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
1660 $code.=<<___ if ($avx>1);
1661 .LSEH_info_${func}_avx2:
1664 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
1666 $code.=<<___ if ($shaext);
1667 .LSEH_info_${func}_shaext:
1670 .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
1674 ####################################################################
1676 local *opcode=shift;
1680 $rex|=0x04 if($dst>=8);
1681 $rex|=0x01 if($src>=8);
1682 unshift @opcode,$rex|0x40 if($rex);
1687 "sha256rnds2" => 0xcb,
1688 "sha256msg1" => 0xcc,
1689 "sha256msg2" => 0xcd );
1694 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1695 my @opcode=(0x0f,0x38);
1696 rex(\@opcode,$2,$1);
1697 push @opcode,$opcodelet{$instr};
1698 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
1699 return ".byte\t".join(',',@opcode);
1701 return $instr."\t".@_[0];
# Post-process the accumulated assembly text in $code.
# 1. Replace every `...` (backtick-quoted) fragment with the result of
#    evaluating it as a Perl expression.
1706 $code =~ s/\`([^\`]*)\`/eval $1/gem;
# 2. Hand every instruction whose mnemonic starts with "sha256", along
#    with its operand text, to sha256op38(). That helper returns a .byte
#    encoding for the register-to-register forms it recognises and the
#    original instruction text otherwise.
1707 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;