# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encrypt exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows the
# processor's resources to be utilized better, for better performance.
# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
# the AESNI code is woven into them. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte, less is better,
# for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
#              AES-128/-192/-256+SHA256   this(**)   gain
# Sandy Bridge 5.05/6.05/7.05+11.6        13.0       +28%/36%/43%
# Ivy Bridge   5.05/6.05/7.05+10.3        11.6       +32%/41%/50%
# Haswell      4.43/5.29/6.19+7.80        8.79       +39%/49%/59%
# Bulldozer    5.77/6.89/8.00+13.7        13.7       +42%/50%/58%
# (*)  there are XOP, AVX1 and AVX2 code paths; Westmere is omitted
#      from the loop because the gain was not estimated high enough
#      to justify the effort;
# (**) these are EVP-free results; results obtained with 'speed
#      -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	`ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}
if (!$avx && `$ENV{CC} -v` =~ /LLVM ([3-9]\.[0-9]+)/) {
	$avx = ($1>=3.0) + ($1>=3.1);
}
$shaext=$avx; ### set to zero if compiling for 1.0.1
$avx=1 if (!$shaext && $avx);
open OUT,"| \"$^X\" $xlate $flavour $output";
$func="aesni_cbc_sha256_enc";
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
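#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA256_CTX *ctx,
#			const void *in0);
# (the trailing arguments are reconstructed from the register assignment
# below; the exact C types shown are an assumption)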
($inp, $out, $len, $key, $ivp, $ctx, $in0) =
("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="16*$SZ+7*8(%rsp)";
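# Frame layout implied by the offsets above: 16*$SZ bytes of X[0..15]
# message-schedule scratch at %rsp, then eight saved quadwords holding
# inp, out, end-of-input, key, ivp, ctx, in0 and the caller's %rsp.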
.extern OPENSSL_ia32cap_P
.type $func,\@abi-omnipotent
lea OPENSSL_ia32cap_P(%rip),%r11
cmp \$0,`$win64?"%rcx":"%rdi"`
$code.=<<___ if ($shaext);
bt \$61,%r10 # check for SHA
test \$`1<<11`,%r10d # check for XOP
$code.=<<___ if ($avx>1);
and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
cmp \$`1<<8|1<<5|1<<3`,%r11d
and \$`1<<30`,%eax # mask "Intel CPU" bit
and \$`1<<28|1<<9`,%r10d # mask AVX+SSSE3 bits
cmp \$`1<<28|1<<9|1<<30`,%r10d
cmp \$0,`$win64?"%rcx":"%rdi"`
.type $TABLE,\@object
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1
.long 0,0,0,0, 0,0,0,0
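# Every K256 row above appears twice so that the AVX2 path can load the
# same round constants into both 128-bit lanes of a 256-bit register.
# The trailing rows are the byte-swap mask and zero/all-ones patterns;
# the latter back the $mask10/$mask12/$mask14 loads that select the
# aesenclast result matching a 10-, 12- or 14-round key schedule (a
# reading of the code, not stated in the original comments).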
.asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
######################################################################
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
## &vmovdqu ($roundkey,"0x00-0x80($inp)");
## &vmovdqu ($inout,($inp));
## &mov ($_inp,$inp);
'&vpxor ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
'&vpxor ($inout,$inout,$iv);',
'&vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
'&vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
'&vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
'&vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
'&vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
'&vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
'&vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
'&vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
'&vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
'&vaesenclast ($temp,$inout,$roundkey);'.
' &vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
'&vpand ($iv,$temp,$mask10);'.
' &vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
'&vaesenclast ($temp,$inout,$roundkey);'.
' &vaesenc ($inout,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
'&vpand ($temp,$temp,$mask12);'.
' &vaesenc ($inout,$inout,$roundkey);'.
'&vmovdqu ($roundkey,"0xe0-0x80($inp)");',
'&vpor ($iv,$iv,$temp);'.
' &vaesenclast ($temp,$inout,$roundkey);'.
' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
## &mov ($inp,$_inp);
## &mov ($out,$_out);
## &vpand ($temp,$temp,$mask14);
## &vpor ($iv,$iv,$temp);
## &vmovdqu ("($out,$inp)",$iv);
## &lea ($inp,"16($inp)");
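# Each element of @aesni_cbc_block is one step of CBC-encrypting a single
# 16-byte block: the IV/round-key xor, a ladder of vaesenc instructions,
# and vaesenclast results blended through the key-length masks. The SHA
# round bodies below splice in one element per round via $aesni_cbc_idx,
# so one AES block completes per 16 rounds of SHA256 schedule.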
my ($a,$b,$c,$d,$e,$f,$g,$h);
sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
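# For example, &ror($a0,14) lands here and emits "ror \$14,%r13d": the
# last argument is popped, purely numeric arguments gain an immediate
# '$' prefix, and the remaining operands are reversed into AT&T order.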
'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
'&ror ($a0,$Sigma1[2]-$Sigma1[1])',
'&ror ($a1,$Sigma0[2]-$Sigma0[1])',
'&xor ($a4,$g)', # f^g
'&ror ($a0,$Sigma1[1]-$Sigma1[0])',
'&and ($a4,$e)', # (f^g)&e
$aesni_cbc_block[$aesni_cbc_idx++].
'&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
'&ror ($a1,$Sigma0[1]-$Sigma0[0])',
'&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
'&xor ($a2,$b)', # a^b, b^c in next round
'&ror ($a0,$Sigma1[0])', # Sigma1(e)
'&add ($h,$a4)', # h+=Ch(e,f,g)
'&and ($a3,$a2)', # (b^c)&(a^b)
'&add ($h,$a0)', # h+=Sigma1(e)
'&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
'&add ($d,$h)', # d+=h
'&ror ($a1,$Sigma0[0])', # Sigma0(a)
'&add ($h,$a3)', # h+=Maj(a,b,c)
'&add ($a1,$h);'. # h+=Sigma0(a)
'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
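# The net effect of one pass through the fragments above is the standard
# SHA256 round:
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	T2 = Sigma0(a) + Maj(a,b,c)
#	d += T1; h = T1 + T2
# with Maj(a,b,c) computed incrementally as ((b^c)&(a^b))^b so that one
# xor can be shared between adjacent rounds.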
######################################################################
.type ${func}_xop,\@function,6
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
mov %rsp,%r11 # copy %rsp
sub \$`$framesz+$win64*16*10`,%rsp
and \$-64,%rsp # align stack frame
sub $inp,$out # re-bias
add $inp,$len # end of input
#mov $inp,$_inp # saved later
#mov $key,$_key # remains resident in $inp register
$code.=<<___ if ($win64);
movaps %xmm6,`$framesz+16*0`(%rsp)
movaps %xmm7,`$framesz+16*1`(%rsp)
movaps %xmm8,`$framesz+16*2`(%rsp)
movaps %xmm9,`$framesz+16*3`(%rsp)
movaps %xmm10,`$framesz+16*4`(%rsp)
movaps %xmm11,`$framesz+16*5`(%rsp)
movaps %xmm12,`$framesz+16*6`(%rsp)
movaps %xmm13,`$framesz+16*7`(%rsp)
movaps %xmm14,`$framesz+16*8`(%rsp)
movaps %xmm15,`$framesz+16*9`(%rsp)
mov $inp,%r12 # borrow $a4
lea 0x80($key),$inp # size optimization, reassign
lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
mov $ctx,%r15 # borrow $a2
mov $in0,%rsi # borrow $a3
vmovdqu ($ivp),$iv # load IV
vmovdqa 0x00(%r13,%r14,8),$mask14
vmovdqa 0x10(%r13,%r14,8),$mask12
vmovdqa 0x20(%r13,%r14,8),$mask10
vmovdqu 0x00-0x80($inp),$roundkey
if ($SZ==4) { # SHA256
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu 0x00(%rsi,%r12),@X[0]
vmovdqu 0x10(%rsi,%r12),@X[1]
vmovdqu 0x20(%rsi,%r12),@X[2]
vmovdqu 0x30(%rsi,%r12),@X[3]
vpshufb $t3,@X[0],@X[0]
lea $TABLE(%rip),$Tbl
vpshufb $t3,@X[1],@X[1]
vpshufb $t3,@X[2],@X[2]
vpaddd 0x00($Tbl),@X[0],$t0
vpshufb $t3,@X[3],@X[3]
vpaddd 0x20($Tbl),@X[1],$t1
vpaddd 0x40($Tbl),@X[2],$t2
vpaddd 0x60($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
vmovdqa $t1,0x10(%rsp)
vmovdqa $t2,0x20(%rsp)
vmovdqa $t3,0x30(%rsp)
sub \$-16*2*$SZ,$Tbl # size optimization
vmovdqu (%r12),$inout # $a4
sub XOP_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
&vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
&vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
&vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
&vpsrld ($t0,$t0,$sigma0[2]);
&vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
&vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
&vpxor ($t0,$t0,$t1);
&vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
&vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
&vpsrld ($t2,@X[3],$sigma1[2]);
&vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
&vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
&vpxor ($t3,$t3,$t2);
&vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
&vpsrldq ($t3,$t3,8);
&vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
&vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
&vpsrld ($t2,@X[0],$sigma1[2]);
&vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
&vpxor ($t3,$t3,$t2);
&vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
&vpslldq ($t3,$t3,8); # 22 instructions
&vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
&vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
foreach (@insns) { eval; } # remaining instructions
&vmovdqa (16*$j."(%rsp)",$t2);
for ($i=0,$j=0; $j<4; $j++) {
&XOP_256_00_47($j,\&body_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
&mov ("%r12",$_inp); # borrow $a4
&vpand ($temp,$temp,$mask14);
&mov ("%r15",$_out); # borrow $a2
&vpor ($iv,$iv,$temp);
&vmovdqu ("(%r15,%r12)",$iv); # write output
&lea ("%r12","16(%r12)"); # inp++
&cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
&jne (".Lxop_00_47");
&vmovdqu ($inout,"(%r12)");
for ($i=0; $i<16; ) {
foreach(body_00_15()) { eval; }
mov $_inp,%r12 # borrow $a4
mov $_out,%r13 # borrow $a0
mov $_ctx,%r15 # borrow $a2
mov $_in0,%rsi # borrow $a3
vpand $mask14,$temp,$temp
vmovdqu $iv,(%r13,%r12) # write output
lea 16(%r12),%r12 # inp++
vmovdqu $iv,($ivp) # output IV
$code.=<<___ if ($win64);
movaps `$framesz+16*0`(%rsp),%xmm6
movaps `$framesz+16*1`(%rsp),%xmm7
movaps `$framesz+16*2`(%rsp),%xmm8
movaps `$framesz+16*3`(%rsp),%xmm9
movaps `$framesz+16*4`(%rsp),%xmm10
movaps `$framesz+16*5`(%rsp),%xmm11
movaps `$framesz+16*6`(%rsp),%xmm12
movaps `$framesz+16*7`(%rsp),%xmm13
movaps `$framesz+16*8`(%rsp),%xmm14
movaps `$framesz+16*9`(%rsp),%xmm15
.size ${func}_xop,.-${func}_xop
######################################################################
local *ror = sub { &shrd($_[0],@_) };
.type ${func}_avx,\@function,6
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
mov %rsp,%r11 # copy %rsp
sub \$`$framesz+$win64*16*10`,%rsp
and \$-64,%rsp # align stack frame
sub $inp,$out # re-bias
add $inp,$len # end of input
#mov $inp,$_inp # saved later
#mov $key,$_key # remains resident in $inp register
$code.=<<___ if ($win64);
movaps %xmm6,`$framesz+16*0`(%rsp)
movaps %xmm7,`$framesz+16*1`(%rsp)
movaps %xmm8,`$framesz+16*2`(%rsp)
movaps %xmm9,`$framesz+16*3`(%rsp)
movaps %xmm10,`$framesz+16*4`(%rsp)
movaps %xmm11,`$framesz+16*5`(%rsp)
movaps %xmm12,`$framesz+16*6`(%rsp)
movaps %xmm13,`$framesz+16*7`(%rsp)
movaps %xmm14,`$framesz+16*8`(%rsp)
movaps %xmm15,`$framesz+16*9`(%rsp)
mov $inp,%r12 # borrow $a4
lea 0x80($key),$inp # size optimization, reassign
lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
mov $ctx,%r15 # borrow $a2
mov $in0,%rsi # borrow $a3
vmovdqu ($ivp),$iv # load IV
vmovdqa 0x00(%r13,%r14,8),$mask14
vmovdqa 0x10(%r13,%r14,8),$mask12
vmovdqa 0x20(%r13,%r14,8),$mask10
vmovdqu 0x00-0x80($inp),$roundkey
if ($SZ==4) { # SHA256
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu 0x00(%rsi,%r12),@X[0]
vmovdqu 0x10(%rsi,%r12),@X[1]
vmovdqu 0x20(%rsi,%r12),@X[2]
vmovdqu 0x30(%rsi,%r12),@X[3]
vpshufb $t3,@X[0],@X[0]
lea $TABLE(%rip),$Tbl
vpshufb $t3,@X[1],@X[1]
vpshufb $t3,@X[2],@X[2]
vpaddd 0x00($Tbl),@X[0],$t0
vpshufb $t3,@X[3],@X[3]
vpaddd 0x20($Tbl),@X[1],$t1
vpaddd 0x40($Tbl),@X[2],$t2
vpaddd 0x60($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
vmovdqa $t1,0x10(%rsp)
vmovdqa $t2,0x20(%rsp)
vmovdqa $t3,0x30(%rsp)
sub \$-16*2*$SZ,$Tbl # size optimization
vmovdqu (%r12),$inout # $a4
sub Xupdate_256_AVX () {
'&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
'&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
'&vpsrld ($t2,$t0,$sigma0[0]);',
'&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
'&vpsrld ($t3,$t0,$sigma0[2])',
'&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
'&vpxor ($t0,$t3,$t2)',
'&vpshufd ($t3,@X[3],0b11111010)', # X[14..15]
'&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
'&vpxor ($t0,$t0,$t1)',
'&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
'&vpxor ($t0,$t0,$t2)',
'&vpsrld ($t2,$t3,$sigma1[2]);',
'&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
'&vpsrlq ($t3,$t3,$sigma1[0]);',
'&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
'&vpxor ($t2,$t2,$t3);',
'&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
'&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
'&vpshufd ($t2,$t2,0b10000100)',
'&vpsrldq ($t2,$t2,8)',
'&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
'&vpshufd ($t3,@X[0],0b01010000)', # X[16..17]
'&vpsrld ($t2,$t3,$sigma1[2])',
'&vpsrlq ($t3,$t3,$sigma1[0])',
'&vpxor ($t2,$t2,$t3);',
'&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
'&vpxor ($t2,$t2,$t3)',
'&vpshufd ($t2,$t2,0b11101000)',
'&vpslldq ($t2,$t2,8)',
'&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
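# For reference, the fragments above compute the standard SHA256 message
# schedule:
#	sigma0(x) = (x >>> 7) ^ (x >>> 18) ^ (x >> 3)
#	sigma1(x) = (x >>> 17) ^ (x >>> 19) ^ (x >> 10)
#	X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
# with each rotate synthesized from shift/xor pairs, since AVX1 has no
# vector rotate instruction.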
sub AVX_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
foreach (Xupdate_256_AVX()) { # 29 instructions
&vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
foreach (@insns) { eval; } # remaining instructions
&vmovdqa (16*$j."(%rsp)",$t2);
for ($i=0,$j=0; $j<4; $j++) {
&AVX_256_00_47($j,\&body_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
&mov ("%r12",$_inp); # borrow $a4
&vpand ($temp,$temp,$mask14);
&mov ("%r15",$_out); # borrow $a2
&vpor ($iv,$iv,$temp);
&vmovdqu ("(%r15,%r12)",$iv); # write output
&lea ("%r12","16(%r12)"); # inp++
&cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
&jne (".Lavx_00_47");
&vmovdqu ($inout,"(%r12)");
for ($i=0; $i<16; ) {
foreach(body_00_15()) { eval; }
mov $_inp,%r12 # borrow $a4
mov $_out,%r13 # borrow $a0
mov $_ctx,%r15 # borrow $a2
mov $_in0,%rsi # borrow $a3
vpand $mask14,$temp,$temp
vmovdqu $iv,(%r13,%r12) # write output
lea 16(%r12),%r12 # inp++
vmovdqu $iv,($ivp) # output IV
$code.=<<___ if ($win64);
movaps `$framesz+16*0`(%rsp),%xmm6
movaps `$framesz+16*1`(%rsp),%xmm7
movaps `$framesz+16*2`(%rsp),%xmm8
movaps `$framesz+16*3`(%rsp),%xmm9
movaps `$framesz+16*4`(%rsp),%xmm10
movaps `$framesz+16*5`(%rsp),%xmm11
movaps `$framesz+16*6`(%rsp),%xmm12
movaps `$framesz+16*7`(%rsp),%xmm13
movaps `$framesz+16*8`(%rsp),%xmm14
movaps `$framesz+16*9`(%rsp),%xmm15
.size ${func}_avx,.-${func}_avx
######################################################################
my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
# at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
'&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
'&and ($a4,$e)', # f&e
'&rorx ($a0,$e,$Sigma1[2])',
'&rorx ($a2,$e,$Sigma1[1])',
'&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
'&lea ($h,"($h,$a4)")',
'&andn ($a4,$e,$g)', # ~e&g
'&rorx ($a1,$e,$Sigma1[0])',
'&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
'&xor ($a0,$a1)', # Sigma1(e)
'&rorx ($a4,$a,$Sigma0[2])',
'&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
'&xor ($a2,$b)', # a^b, b^c in next round
'&rorx ($a1,$a,$Sigma0[1])',
'&rorx ($a0,$a,$Sigma0[0])',
'&lea ($d,"($d,$h)")', # d+=h
'&and ($a3,$a2)', # (b^c)&(a^b)
$aesni_cbc_block[$aesni_cbc_idx++].
'&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
'&xor ($a1,$a0)', # Sigma0(a)
'&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
'&mov ($a4,$e)', # copy of f in future
'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
# and at the finish one has to $a+=$a1
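# This variant leans on BMI1/BMI2: rorx rotates without touching the
# flags, andn yields ~e&g in one instruction, and Ch(e,f,g) =
# (e&f)|(~e&g) is accumulated with two lea additions, which is safe
# because the two terms never have a set bit in common.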
.type ${func}_avx2,\@function,6
mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
mov %rsp,%r11 # copy %rsp
sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
and \$-256*$SZ,%rsp # align stack frame
add \$`2*$SZ*($rounds-8)`,%rsp
sub $inp,$out # re-bias
add $inp,$len # end of input
#mov $inp,$_inp # saved later
#mov $out,$_out # kept in $offload
#mov $key,$_key # remains resident in $inp register
$code.=<<___ if ($win64);
movaps %xmm6,`$framesz+16*0`(%rsp)
movaps %xmm7,`$framesz+16*1`(%rsp)
movaps %xmm8,`$framesz+16*2`(%rsp)
movaps %xmm9,`$framesz+16*3`(%rsp)
movaps %xmm10,`$framesz+16*4`(%rsp)
movaps %xmm11,`$framesz+16*5`(%rsp)
movaps %xmm12,`$framesz+16*6`(%rsp)
movaps %xmm13,`$framesz+16*7`(%rsp)
movaps %xmm14,`$framesz+16*8`(%rsp)
movaps %xmm15,`$framesz+16*9`(%rsp)
mov $inp,%r13 # borrow $a0
vpinsrq \$1,$out,$offload,$offload
lea 0x80($key),$inp # size optimization, reassign
lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
mov $ctx,%r15 # borrow $a2
mov $in0,%rsi # borrow $a3
vmovdqu ($ivp),$iv # load IV
vmovdqa 0x00(%r12,%r14,8),$mask14
vmovdqa 0x10(%r12,%r14,8),$mask12
vmovdqa 0x20(%r12,%r14,8),$mask10
sub \$-16*$SZ,%r13 # inp++, size optimization
lea (%rsi,%r13),%r12 # borrow $a0
cmp $len,%r13 # $_end
cmove %rsp,%r12 # next block or random data
vmovdqu 0x00-0x80($inp),$roundkey
if ($SZ==4) { # SHA256
my @X = map("%ymm$_",(0..3));
my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
vinserti128 \$1,(%r12),@X[0],@X[0]
vinserti128 \$1,16(%r12),@X[1],@X[1]
vpshufb $t3,@X[0],@X[0]
vinserti128 \$1,32(%r12),@X[2],@X[2]
vpshufb $t3,@X[1],@X[1]
vinserti128 \$1,48(%r12),@X[3],@X[3]
lea $TABLE(%rip),$Tbl
vpshufb $t3,@X[2],@X[2]
lea -16*$SZ(%r13),%r13
vpaddd 0x00($Tbl),@X[0],$t0
vpshufb $t3,@X[3],@X[3]
vpaddd 0x20($Tbl),@X[1],$t1
vpaddd 0x40($Tbl),@X[2],$t2
vpaddd 0x60($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
vmovdqa $t1,0x20(%rsp)
lea -$PUSH8(%rsp),%rsp
vmovdqa $t2,0x00(%rsp)
vmovdqa $t3,0x20(%rsp)
sub \$-16*2*$SZ,$Tbl # size optimization
vmovdqu (%r13),$inout
vpinsrq \$0,%r13,$offload,$offload
sub AVX2_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
my $base = "+2*$PUSH8(%rsp)";
&lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
foreach (Xupdate_256_AVX()) { # 29 instructions
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
foreach (@insns) { eval; } # remaining instructions
&vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
for ($i=0,$j=0; $j<4; $j++) {
&AVX2_256_00_47($j,\&bodyx_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
&vmovq ("%r13",$offload); # borrow $a0
&vpextrq ("%r15",$offload,1); # borrow $a2
&vpand ($temp,$temp,$mask14);
&vpor ($iv,$iv,$temp);
&vmovdqu ("(%r15,%r13)",$iv); # write output
&lea ("%r13","16(%r13)"); # inp++
&lea ($Tbl,16*2*$SZ."($Tbl)");
&cmpb (($SZ-1)."($Tbl)",0);
&jne (".Lavx2_00_47");
&vmovdqu ($inout,"(%r13)");
&vpinsrq ($offload,$offload,"%r13",0);
for ($i=0; $i<16; ) {
my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
foreach(bodyx_00_15()) { eval; }
vpextrq \$1,$offload,%r12 # $_out, borrow $a4
vmovq $offload,%r13 # $_inp, borrow $a0
mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
vpand $mask14,$temp,$temp
vmovdqu $iv,(%r12,%r13) # write output
cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
vmovdqu (%r13),$inout
vpinsrq \$0,%r13,$offload,$offload
for ($i=0; $i<16; ) {
my $base="+16($Tbl)";
foreach(bodyx_00_15()) { eval; }
&lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8);
vmovq $offload,%r13 # borrow $a0
vpextrq \$1,$offload,%r15 # borrow $a2
vpand $mask14,$temp,$temp
lea -$PUSH8($Tbl),$Tbl
vmovdqu $iv,(%r15,%r13) # write output
lea 16(%r13),%r13 # inp++
mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
lea 16*$SZ(%r13),%r13
mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
lea `2*$SZ*($rounds-8)`(%rsp),%rsp
lea (%rsi,%r13),%r12
cmove %rsp,%r12 # next block or stale data
vmovdqu $iv,($ivp) # output IV
$code.=<<___ if ($win64);
movaps `$framesz+16*0`(%rsp),%xmm6
movaps `$framesz+16*1`(%rsp),%xmm7
movaps `$framesz+16*2`(%rsp),%xmm8
movaps `$framesz+16*3`(%rsp),%xmm9
movaps `$framesz+16*4`(%rsp),%xmm10
movaps `$framesz+16*5`(%rsp),%xmm11
movaps `$framesz+16*6`(%rsp),%xmm12
movaps `$framesz+16*7`(%rsp),%xmm13
movaps `$framesz+16*8`(%rsp),%xmm14
movaps `$framesz+16*9`(%rsp),%xmm15
.size ${func}_avx2,.-${func}_avx2
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
my ($rounds,$Tbl)=("%r11d","%rbx");
my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");
my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));
my ($n,$k)=($r/10,$r%10);
movups `16*$n`($in0),$in # load input
$code.=<<___ if ($n);
movups $iv,`16*($n-1)`($out,$in0) # write output
movups `32+16*$k-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*($k+0)-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*($k+1)-112`($key),$rndkey[0]
aesenc $rndkey[1],$iv
movups `32+16*($k+2)-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
movups `32+16*($k+3)-112`($key),$rndkey[0]
aesenc $rndkey[1],$iv
aesenclast $rndkey[0],$iv
movups 16-112($key),$rndkey[1] # forward reference
movups `32+16*$k-112`($key),$rndkey[1]
aesenc $rndkey[0],$iv
$r++; unshift(@rndkey,pop(@rndkey));
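# $aesenc() doles out the CBC encryption one instruction at a time:
# ($n,$k)=($r/10,$r%10) track which 16-byte block and which AES round
# are due, so input loads, the aesenc ladder (with the extra rounds for
# 192/256-bit keys) and ciphertext stores are interleaved between the
# sha256rnds2 instructions below.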
.type ${func}_shaext,\@function,6
mov `($win64?56:8)`(%rsp),$inp # load 7th argument
$code.=<<___ if ($win64);
lea `-8-10*16`(%rsp),%rsp
movaps %xmm6,-8-10*16(%rax)
movaps %xmm7,-8-9*16(%rax)
movaps %xmm8,-8-8*16(%rax)
movaps %xmm9,-8-7*16(%rax)
movaps %xmm10,-8-6*16(%rax)
movaps %xmm11,-8-5*16(%rax)
movaps %xmm12,-8-4*16(%rax)
movaps %xmm13,-8-3*16(%rax)
movaps %xmm14,-8-2*16(%rax)
movaps %xmm15,-8-1*16(%rax)
lea K256+0x80(%rip),$Tbl
movdqu ($ctx),$ABEF # DCBA
movdqu 16($ctx),$CDGH # HGFE
movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
mov 240($key),$rounds
movups ($key),$rndkey0 # $key[0]
movups 16($key),$rndkey[0] # forward reference
lea 112($key),$key # size optimization
pshufd \$0x1b,$ABEF,$Wi # ABCD
pshufd \$0xb1,$ABEF,$ABEF # CDAB
pshufd \$0x1b,$CDGH,$CDGH # EFGH
movdqa $TMP,$BSWAP # offload
palignr \$8,$CDGH,$ABEF # ABEF
punpcklqdq $Wi,$CDGH # CDGH
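# sha256rnds2 performs two rounds per issue and expects the state split
# into ABEF and CDGH halves, with the round constants in the low dwords
# of $Wi (%xmm0, its implicit operand); the pshufd/palignr/punpcklqdq
# sequence above rearranges the little-endian DCBA/HGFE words loaded
# from $ctx into that layout.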
movdqu ($inp),@MSG[0]
movdqu 0x10($inp),@MSG[1]
movdqu 0x20($inp),@MSG[2]
movdqu 0x30($inp),@MSG[3]
movdqa 0*32-0x80($Tbl),$Wi
movdqa $CDGH,$CDGH_SAVE # offload
movdqa $ABEF,$ABEF_SAVE # offload
sha256rnds2 $ABEF,$CDGH # 0-3
pshufd \$0x0e,$Wi,$Wi
sha256rnds2 $CDGH,$ABEF
movdqa 1*32-0x80($Tbl),$Wi
sha256rnds2 $ABEF,$CDGH # 4-7
pshufd \$0x0e,$Wi,$Wi
sha256rnds2 $CDGH,$ABEF
movdqa 2*32-0x80($Tbl),$Wi
sha256msg1 @MSG[1],@MSG[0]
sha256rnds2 $ABEF,$CDGH # 8-11
pshufd \$0x0e,$Wi,$Wi
palignr \$4,@MSG[2],$TMP
sha256rnds2 $CDGH,$ABEF
movdqa 3*32-0x80($Tbl),$Wi
sha256msg2 @MSG[3],@MSG[0]
sha256msg1 @MSG[2],@MSG[1]
sha256rnds2 $ABEF,$CDGH # 12-15
pshufd \$0x0e,$Wi,$Wi
palignr \$4,@MSG[3],$TMP
sha256rnds2 $CDGH,$ABEF
for($i=4;$i<16-3;$i++) {
&$aesenc() if (($r%10)==0);
movdqa $i*32-0x80($Tbl),$Wi
sha256msg2 @MSG[0],@MSG[1]
sha256msg1 @MSG[3],@MSG[2]
sha256rnds2 $ABEF,$CDGH # 16-19...
pshufd \$0x0e,$Wi,$Wi
palignr \$4,@MSG[0],$TMP
&$aesenc() if ($r==19);
sha256rnds2 $CDGH,$ABEF
push(@MSG,shift(@MSG));
movdqa 13*32-0x80($Tbl),$Wi
sha256msg2 @MSG[0],@MSG[1]
sha256msg1 @MSG[3],@MSG[2]
sha256rnds2 $ABEF,$CDGH # 52-55
pshufd \$0x0e,$Wi,$Wi
palignr \$4,@MSG[0],$TMP
sha256rnds2 $CDGH,$ABEF
movdqa 14*32-0x80($Tbl),$Wi
sha256msg2 @MSG[1],@MSG[2]
sha256rnds2 $ABEF,$CDGH # 56-59
pshufd \$0x0e,$Wi,$Wi
sha256rnds2 $CDGH,$ABEF
movdqa 15*32-0x80($Tbl),$Wi
sha256rnds2 $ABEF,$CDGH # 60-63
pshufd \$0x0e,$Wi,$Wi
sha256rnds2 $CDGH,$ABEF
#pxor $CDGH,$rndkey0 # black magic
while ($r<40) { &$aesenc(); } # remaining aesenc's
#xorps $CDGH,$rndkey0 # black magic
paddd $CDGH_SAVE,$CDGH
paddd $ABEF_SAVE,$ABEF
movups $iv,48($out,$in0) # write output
pshufd \$0xb1,$CDGH,$CDGH # DCHG
pshufd \$0x1b,$ABEF,$TMP # FEBA
pshufd \$0xb1,$ABEF,$ABEF # BAFE
punpckhqdq $CDGH,$ABEF # DCBA
palignr \$8,$TMP,$CDGH # HGFE
movups $iv,($ivp) # write IV
movdqu $CDGH,16($ctx)
$code.=<<___ if ($win64);
movaps 0*16(%rsp),%xmm6
movaps 1*16(%rsp),%xmm7
movaps 2*16(%rsp),%xmm8
movaps 3*16(%rsp),%xmm9
movaps 4*16(%rsp),%xmm10
movaps 5*16(%rsp),%xmm11
movaps 6*16(%rsp),%xmm12
movaps 7*16(%rsp),%xmm13
movaps 8*16(%rsp),%xmm14
movaps 9*16(%rsp),%xmm15
lea 8+10*16(%rsp),%rsp
.size ${func}_shaext,.-${func}_shaext
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
$code.=<<___ if ($avx);
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<prologue label
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
$code.=<<___ if ($shaext);
lea aesni_cbc_sha256_enc_shaext(%rip),%r10
lea 512($context),%rdi # &context.Xmm6
.long 0xa548f3fc # cld; rep movsq
lea 168(%rax),%rax # adjust stack pointer
$code.=<<___ if ($avx>1);
lea .Lavx2_shortcut(%rip),%r10
cmp %r10,%rbx # context->Rip<avx2_shortcut
add \$`2*$SZ*($rounds-8)`,%rax
mov %rax,%rsi # put aside Rsp
mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
lea 16*$SZ+8*8(%rsi),%rsi # Xmm6-15 save area
lea 512($context),%rdi # &context.Xmm6
.long 0xa548f3fc # cld; rep movsq
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
.size se_handler,.-se_handler
.rva .LSEH_begin_${func}_xop
.rva .LSEH_end_${func}_xop
.rva .LSEH_info_${func}_xop
.rva .LSEH_begin_${func}_avx
.rva .LSEH_end_${func}_avx
.rva .LSEH_info_${func}_avx
$code.=<<___ if ($avx>1);
.rva .LSEH_begin_${func}_avx2
.rva .LSEH_end_${func}_avx2
.rva .LSEH_info_${func}_avx2
$code.=<<___ if ($shaext);
.rva .LSEH_begin_${func}_shaext
.rva .LSEH_end_${func}_shaext
.rva .LSEH_info_${func}_shaext
$code.=<<___ if ($avx);
.LSEH_info_${func}_xop:
.rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
.LSEH_info_${func}_avx:
.rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
.rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
.rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
####################################################################
local *opcode=shift;
$rex|=0x04 if($dst>=8);
$rex|=0x01 if($src>=8);
unshift @opcode,$rex|0x40 if($rex);
"sha256rnds2" => 0xcb,
"sha256msg1" => 0xcc,
"sha256msg2" => 0xcd );
if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
my @opcode=(0x0f,0x38);
rex(\@opcode,$2,$1);
push @opcode,$opcodelet{$instr};
push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
return ".byte\t".join(',',@opcode);
}
return $instr."\t".$_[0];
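# Worked example of the fallback encoder: "sha256msg1 %xmm10,%xmm13" is
# emitted as ".byte 0x45,0x0f,0x38,0xcc,0xea", where REX 0x45 carries
# the high register bits, 0xcc is the sha256msg1 opcode byte and ModR/M
# 0xea encodes dst=%xmm13, src=%xmm10.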
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;