3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
13 # in http://download.intel.com/design/intarch/papers/323686.pdf, is
14 # that since AESNI-CBC encrypt exhibits *very* low instruction-level
15 # parallelism, interleaving it with another algorithm would allow one to
16 # utilize processor resources better and achieve better performance.
17 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
18 # AESNI code is weaved into it. As SHA256 dominates execution time,
19 # stitch performance does not depend on AES key length. Below are
20 # performance numbers in cycles per processed byte, less is better,
21 # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
24 # AES-128/-192/-256+SHA256 this(**)gain
25 # Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43%
26 # Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50%
27 # Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59%
28 # Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58%
30 # (*)	there are XOP, AVX1 and AVX2 code paths, meaning that
31 # Westmere is omitted from loop, this is because gain was not
32 # estimated high enough to justify the effort;
33 # (**) these are EVP-free results, results obtained with 'speed
34 # -evp aes-256-cbc-hmac-sha256' will vary by percent or two;
# Standard perlasm invocation: "script.pl [flavour] [output]".  If the
# first argument contains a dot it is the output file name, not a flavour.
38 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 targets (nasm/masm flavours or .asm output) need a different
# calling convention plus SEH unwind and XMM save/restore code below.
40 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the x86_64-xlate.pl translator next to this script or in the
# sibling perlasm directory; it converts "perlasm" to the target syntax.
42 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
43 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
44 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
45 die "can't locate x86_64-xlate.pl";
# Probe the assembler version to decide how much AVX to emit:
# $avx = 0 (none), 1 (AVX1/XOP), 2 (AVX2).  GNU as >= 2.19 knows AVX1,
# >= 2.22 knows AVX2.
47 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
48 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
49 $avx = ($1>=2.19) + ($1>=2.22);
# Same capability probe for nasm on Win64 (2.09 -> AVX1, 2.10 -> AVX2).
52 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
53 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
54 $avx = ($1>=2.09) + ($1>=2.10);
# And for MSVC's ml64 (version 10 -> AVX1, 11 -> AVX2).
57 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
58 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
59 $avx = ($1>=10) + ($1>=11);
# $shaext enables the SHA-NI (Intel SHA extensions) code path; the
# comment notes it must be zeroed when building for the 1.0.1 branch.
62 $shaext=1; ### set to zero if compiling for 1.0.1
63 $avx=1 if (!$shaext && $avx);
# Pipe all generated code through the translator into the output file.
65 open OUT,"| \"$^X\" $xlate $flavour $output";
# Public entry-point name for the generated routines.
68 $func="aesni_cbc_sha256_enc";
# The eight SHA-256 working variables a..h live in these 32-bit GPRs;
# @ROT is rotated once per round instead of moving data between registers.
71 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
72 "%r8d","%r9d","%r10d","%r11d");
# Scratch registers used by the round body ($T1 appears unused here --
# kept for parity with sha512-x86_64.pl, from which the rounds are taken).
73 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
80 ########################################################################
81 # void aesni_cbc_sha256_enc(const void *inp,
# Argument registers per the ELF x86_64 ABI; on Win64 the prologue
# re-shuffles its four register args plus stack args into these.
88 ($inp, $out, $len, $key, $ivp, $ctx, $in0) =
89 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
# Stack frame layout: 16*$SZ bytes of expanded message schedule X[],
# followed by eight 8-byte spill slots for the arguments and saved %rsp.
93 $_inp="16*$SZ+0*8(%rsp)";
94 $_out="16*$SZ+1*8(%rsp)";
95 $_end="16*$SZ+2*8(%rsp)";
96 $_key="16*$SZ+3*8(%rsp)";
97 $_ivp="16*$SZ+4*8(%rsp)";
98 $_ctx="16*$SZ+5*8(%rsp)";
99 $_in0="16*$SZ+6*8(%rsp)";
100 $_rsp="16*$SZ+7*8(%rsp)";
106 .extern OPENSSL_ia32cap_P
108 .type $func,\@abi-omnipotent
112 $code.=<<___ if ($avx);
113 lea OPENSSL_ia32cap_P(%rip),%r11
115 cmp \$0,`$win64?"%rcx":"%rdi"`
120 $code.=<<___ if ($shaext);
121 bt \$61,%r10 # check for SHA
128 test \$`1<<11`,%r10d # check for XOP
131 $code.=<<___ if ($avx>1);
132 and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
133 cmp \$`1<<8|1<<5|1<<3`,%r11d
136 $code.=<<___ if ($avx);
137 and \$`1<<30`,%eax # mask "Intel CPU" bit
138 and \$`1<<28|1<<9`,%r10d # mask AVX+SSSE3 bits
140 cmp \$`1<<28|1<<9|1<<30`,%r10d
146 cmp \$0,`$win64?"%rcx":"%rdi"`
154 .type $TABLE,\@object
156 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
157 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
158 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
159 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
160 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
161 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
162 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
163 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
164 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
165 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
166 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
167 .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
168 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
169 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
170 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
171 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
172 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
173 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
174 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
175 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
176 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
177 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
178 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
179 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
180 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
181 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
182 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
183 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
184 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
185 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
186 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
187 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
189 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
190 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
191 .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1
192 .long 0,0,0,0, 0,0,0,0
193 .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
197 ######################################################################
201 ($iv,$inout,$roundkey,$temp,
202 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
206 ## &vmovdqu ($roundkey,"0x00-0x80($inp)");'
207 ## &vmovdqu ($inout,($inp));
208 ## &mov ($_inp,$inp);
210 '&vpxor ($inout,$inout,$roundkey);'.
211 ' &vmovdqu ($roundkey,"0x10-0x80($inp)");',
213 '&vpxor ($inout,$inout,$iv);',
215 '&vaesenc ($inout,$inout,$roundkey);'.
216 ' &vmovdqu ($roundkey,"0x20-0x80($inp)");',
218 '&vaesenc ($inout,$inout,$roundkey);'.
219 ' &vmovdqu ($roundkey,"0x30-0x80($inp)");',
221 '&vaesenc ($inout,$inout,$roundkey);'.
222 ' &vmovdqu ($roundkey,"0x40-0x80($inp)");',
224 '&vaesenc ($inout,$inout,$roundkey);'.
225 ' &vmovdqu ($roundkey,"0x50-0x80($inp)");',
227 '&vaesenc ($inout,$inout,$roundkey);'.
228 ' &vmovdqu ($roundkey,"0x60-0x80($inp)");',
230 '&vaesenc ($inout,$inout,$roundkey);'.
231 ' &vmovdqu ($roundkey,"0x70-0x80($inp)");',
233 '&vaesenc ($inout,$inout,$roundkey);'.
234 ' &vmovdqu ($roundkey,"0x80-0x80($inp)");',
236 '&vaesenc ($inout,$inout,$roundkey);'.
237 ' &vmovdqu ($roundkey,"0x90-0x80($inp)");',
239 '&vaesenc ($inout,$inout,$roundkey);'.
240 ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");',
242 '&vaesenclast ($temp,$inout,$roundkey);'.
243 ' &vaesenc ($inout,$inout,$roundkey);'.
244 ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");',
246 '&vpand ($iv,$temp,$mask10);'.
247 ' &vaesenc ($inout,$inout,$roundkey);'.
248 ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");',
250 '&vaesenclast ($temp,$inout,$roundkey);'.
251 ' &vaesenc ($inout,$inout,$roundkey);'.
252 ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");',
254 '&vpand ($temp,$temp,$mask12);'.
255 ' &vaesenc ($inout,$inout,$roundkey);'.
256 '&vmovdqu ($roundkey,"0xe0-0x80($inp)");',
258 '&vpor ($iv,$iv,$temp);'.
259 ' &vaesenclast ($temp,$inout,$roundkey);'.
260 ' &vmovdqu ($roundkey,"0x00-0x80($inp)");'
262 ## &mov ($inp,$_inp);
263 ## &mov ($out,$_out);
264 ## &vpand ($temp,$temp,$mask14);
265 ## &vpor ($iv,$iv,$temp);
266 ## &vmovdqu ($iv,"($out,$inp)");
267 ## &lea ($inp,"16($inp)");
# Round-variable aliases; rebound from @ROT at the top of each round body.
271 my ($a,$b,$c,$d,$e,$f,$g,$h);
# AUTOLOAD catches every undefined &mnemonic(...) call and appends the
# corresponding instruction line to $code -- classic perlasm trick so
# instructions can be written as Perl sub calls.
273 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
274 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
# A purely numeric operand becomes an immediate ("$arg" -> "\$arg").
276 $arg = "\$$arg" if ($arg*1 eq $arg);
# Operands are emitted in AT&T order: immediate/source last argument first.
277 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
# One SHA-256 round expressed as a list of instruction strings.  The
# strings are eval'ed one at a time so AES rounds (from
# @aesni_cbc_block) can be interleaved between them -- this is the
# "stitching".  Register roles: $a0/$a1 accumulate Sigma1(e)/Sigma0(a),
# $a4 builds Ch(e,f,g), $a2/$a3 build Maj(a,b,c).
282 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
284 '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
289 '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
290 '&xor ($a4,$g)', # f^g
292 '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
294 '&and ($a4,$e)', # (f^g)&e
# Insert the next pending AES-CBC instruction at this point in the round.
296 @aesni_cbc_block[$aesni_cbc_idx++].
298 '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
301 '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
302 '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
303 '&xor ($a2,$b)', # a^b, b^c in next round
305 '&ror ($a0,$Sigma1[0])', # Sigma1(e)
306 '&add ($h,$a4)', # h+=Ch(e,f,g)
307 '&and ($a3,$a2)', # (b^c)&(a^b)
310 '&add ($h,$a0)', # h+=Sigma1(e)
311 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
313 '&add ($d,$h)', # d+=h
314 '&ror ($a1,$Sigma0[0])', # Sigma0(a)
315 '&add ($h,$a3)', # h+=Maj(a,b,c)
# Note: Sigma0(a) is folded into $h one round late ("from the past"),
# hence the swap of $a2/$a3 and the rotation of @ROT for the next round.
318 '&add ($a1,$h);'. # h+=Sigma0(a)
319 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
324 ######################################################################
328 .type ${func}_xop,\@function,6
332 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
339 mov %rsp,%r11 # copy %rsp
340 sub \$`$framesz+$win64*16*10`,%rsp
341 and \$-64,%rsp # align stack frame
344 sub $inp,$out # re-bias
346 add $inp,$len # end of input
348 #mov $inp,$_inp # saved later
351 #mov $key,$_key # remains resident in $inp register
357 $code.=<<___ if ($win64);
358 movaps %xmm6,`$framesz+16*0`(%rsp)
359 movaps %xmm7,`$framesz+16*1`(%rsp)
360 movaps %xmm8,`$framesz+16*2`(%rsp)
361 movaps %xmm9,`$framesz+16*3`(%rsp)
362 movaps %xmm10,`$framesz+16*4`(%rsp)
363 movaps %xmm11,`$framesz+16*5`(%rsp)
364 movaps %xmm12,`$framesz+16*6`(%rsp)
365 movaps %xmm13,`$framesz+16*7`(%rsp)
366 movaps %xmm14,`$framesz+16*8`(%rsp)
367 movaps %xmm15,`$framesz+16*9`(%rsp)
373 mov $inp,%r12 # borrow $a4
374 lea 0x80($key),$inp # size optimization, reassign
375 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
376 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
377 mov $ctx,%r15 # borrow $a2
378 mov $in0,%rsi # borrow $a3
379 vmovdqu ($ivp),$iv # load IV
391 vmovdqa 0x00(%r13,%r14,8),$mask14
392 vmovdqa 0x10(%r13,%r14,8),$mask12
393 vmovdqa 0x20(%r13,%r14,8),$mask10
394 vmovdqu 0x00-0x80($inp),$roundkey
397 if ($SZ==4) { # SHA256
398 my @X = map("%xmm$_",(0..3));
399 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
404 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
405 vmovdqu 0x00(%rsi,%r12),@X[0]
406 vmovdqu 0x10(%rsi,%r12),@X[1]
407 vmovdqu 0x20(%rsi,%r12),@X[2]
408 vmovdqu 0x30(%rsi,%r12),@X[3]
409 vpshufb $t3,@X[0],@X[0]
410 lea $TABLE(%rip),$Tbl
411 vpshufb $t3,@X[1],@X[1]
412 vpshufb $t3,@X[2],@X[2]
413 vpaddd 0x00($Tbl),@X[0],$t0
414 vpshufb $t3,@X[3],@X[3]
415 vpaddd 0x20($Tbl),@X[1],$t1
416 vpaddd 0x40($Tbl),@X[2],$t2
417 vpaddd 0x60($Tbl),@X[3],$t3
418 vmovdqa $t0,0x00(%rsp)
420 vmovdqa $t1,0x10(%rsp)
422 vmovdqa $t2,0x20(%rsp)
424 vmovdqa $t3,0x30(%rsp)
430 sub \$-16*2*$SZ,$Tbl # size optimization
431 vmovdqu (%r12),$inout # $a4
# Four stitched rounds of the XOP path: updates four message-schedule
# words X[0..3] with SIMD (vprotd is the XOP rotate) while eval'ing the
# scalar round bodies from @insns in between the vector instructions.
434 sub XOP_256_00_47 () {
438 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
440 &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
443 &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
446 &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
449 &vpsrld ($t0,$t0,$sigma0[2]);
452 &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
457 &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
460 &vpxor ($t0,$t0,$t1);
465 &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
468 &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
471 &vpsrld ($t2,@X[3],$sigma1[2]);
474 &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
477 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
480 &vpxor ($t3,$t3,$t2);
485 &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
490 &vpsrldq ($t3,$t3,8);
495 &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
# sigma1 depends on the two words just computed, so the high half is
# done in a second pass over X[16..17].
500 &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
503 &vpsrld ($t2,@X[0],$sigma1[2]);
506 &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
509 &vpxor ($t3,$t3,$t2);
514 &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
519 &vpslldq ($t3,$t3,8); # 22 instructions
524 &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
# Pre-add round constants K[] for the next block of rounds and park the
# result in the stack copy of the schedule.
529 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
530 foreach (@insns) { eval; } # remaining instructions
531 &vmovdqa (16*$j."(%rsp)",$t2);
# Generate all 4*16 schedule-update rounds, rotating @X each time.
535 for ($i=0,$j=0; $j<4; $j++) {
536 &XOP_256_00_47($j,\&body_00_15,@X);
537 push(@X,shift(@X)); # rotate(@X)
# End of 16-byte block: merge the key-length-masked AES tail into $iv
# and store the finished CBC ciphertext block.
539 &mov ("%r12",$_inp); # borrow $a4
540 &vpand ($temp,$temp,$mask14);
541 &mov ("%r15",$_out); # borrow $a2
542 &vpor ($iv,$iv,$temp);
543 &vmovdqu ("(%r15,%r12)",$iv); # write output
544 &lea ("%r12","16(%r12)"); # inp++
# Loop until the in-table sentinel byte says the K256 table is exhausted.
546 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
547 &jne (".Lxop_00_47");
549 &vmovdqu ($inout,"(%r12)");
# Final 16 rounds operate on the precomputed schedule, no X update.
553 for ($i=0; $i<16; ) {
554 foreach(body_00_15()) { eval; }
558 mov $_inp,%r12 # borrow $a4
559 mov $_out,%r13 # borrow $a0
560 mov $_ctx,%r15 # borrow $a2
561 mov $_in0,%rsi # borrow $a3
563 vpand $mask14,$temp,$temp
566 vmovdqu $iv,(%r13,%r12) # write output
567 lea 16(%r12),%r12 # inp++
593 vmovdqu $iv,($ivp) # output IV
596 $code.=<<___ if ($win64);
597 movaps `$framesz+16*0`(%rsp),%xmm6
598 movaps `$framesz+16*1`(%rsp),%xmm7
599 movaps `$framesz+16*2`(%rsp),%xmm8
600 movaps `$framesz+16*3`(%rsp),%xmm9
601 movaps `$framesz+16*4`(%rsp),%xmm10
602 movaps `$framesz+16*5`(%rsp),%xmm11
603 movaps `$framesz+16*6`(%rsp),%xmm12
604 movaps `$framesz+16*7`(%rsp),%xmm13
605 movaps `$framesz+16*8`(%rsp),%xmm14
606 movaps `$framesz+16*9`(%rsp),%xmm15
618 .size ${func}_xop,.-${func}_xop
620 ######################################################################
623 local *ror = sub { &shrd(@_[0],@_) };
626 .type ${func}_avx,\@function,6
630 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
637 mov %rsp,%r11 # copy %rsp
638 sub \$`$framesz+$win64*16*10`,%rsp
639 and \$-64,%rsp # align stack frame
642 sub $inp,$out # re-bias
644 add $inp,$len # end of input
646 #mov $inp,$_inp # saved later
649 #mov $key,$_key # remains resident in $inp register
655 $code.=<<___ if ($win64);
656 movaps %xmm6,`$framesz+16*0`(%rsp)
657 movaps %xmm7,`$framesz+16*1`(%rsp)
658 movaps %xmm8,`$framesz+16*2`(%rsp)
659 movaps %xmm9,`$framesz+16*3`(%rsp)
660 movaps %xmm10,`$framesz+16*4`(%rsp)
661 movaps %xmm11,`$framesz+16*5`(%rsp)
662 movaps %xmm12,`$framesz+16*6`(%rsp)
663 movaps %xmm13,`$framesz+16*7`(%rsp)
664 movaps %xmm14,`$framesz+16*8`(%rsp)
665 movaps %xmm15,`$framesz+16*9`(%rsp)
671 mov $inp,%r12 # borrow $a4
672 lea 0x80($key),$inp # size optimization, reassign
673 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0
674 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
675 mov $ctx,%r15 # borrow $a2
676 mov $in0,%rsi # borrow $a3
677 vmovdqu ($ivp),$iv # load IV
689 vmovdqa 0x00(%r13,%r14,8),$mask14
690 vmovdqa 0x10(%r13,%r14,8),$mask12
691 vmovdqa 0x20(%r13,%r14,8),$mask10
692 vmovdqu 0x00-0x80($inp),$roundkey
694 if ($SZ==4) { # SHA256
695 my @X = map("%xmm$_",(0..3));
696 my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
702 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
703 vmovdqu 0x00(%rsi,%r12),@X[0]
704 vmovdqu 0x10(%rsi,%r12),@X[1]
705 vmovdqu 0x20(%rsi,%r12),@X[2]
706 vmovdqu 0x30(%rsi,%r12),@X[3]
707 vpshufb $t3,@X[0],@X[0]
708 lea $TABLE(%rip),$Tbl
709 vpshufb $t3,@X[1],@X[1]
710 vpshufb $t3,@X[2],@X[2]
711 vpaddd 0x00($Tbl),@X[0],$t0
712 vpshufb $t3,@X[3],@X[3]
713 vpaddd 0x20($Tbl),@X[1],$t1
714 vpaddd 0x40($Tbl),@X[2],$t2
715 vpaddd 0x60($Tbl),@X[3],$t3
716 vmovdqa $t0,0x00(%rsp)
718 vmovdqa $t1,0x10(%rsp)
720 vmovdqa $t2,0x20(%rsp)
722 vmovdqa $t3,0x30(%rsp)
728 sub \$-16*2*$SZ,$Tbl # size optimization
729 vmovdqu (%r12),$inout # $a4
# SHA-256 message-schedule update for the AVX/AVX2 paths, returned as a
# list of instruction strings so the caller can interleave them with
# scalar round bodies.  AVX1 lacks XOP's vprotd, so each rotate is
# synthesized from shift/shift/xor pairs; sigma1 of the odd lanes is
# handled via vpshufd/vpsrlq on X[14..15] and then X[16..17].
732 sub Xupdate_256_AVX () {
734 '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
735 '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
736 '&vpsrld ($t2,$t0,$sigma0[0]);',
737 '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
738 '&vpsrld ($t3,$t0,$sigma0[2])',
739 '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
740 '&vpxor ($t0,$t3,$t2)',
741 '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
742 '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
743 '&vpxor ($t0,$t0,$t1)',
744 '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
745 '&vpxor ($t0,$t0,$t2)',
746 '&vpsrld ($t2,$t3,$sigma1[2]);',
747 '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
748 '&vpsrlq ($t3,$t3,$sigma1[0]);',
749 '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
750 '&vpxor ($t2,$t2,$t3);',
751 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
752 '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15])
753 '&vpshufd ($t2,$t2,0b10000100)',
754 '&vpsrldq ($t2,$t2,8)',
755 '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
756 '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
757 '&vpsrld ($t2,$t3,$sigma1[2])',
758 '&vpsrlq ($t3,$t3,$sigma1[0])',
759 '&vpxor ($t2,$t2,$t3);',
760 '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
761 '&vpxor ($t2,$t2,$t3)',
762 '&vpshufd ($t2,$t2,0b11101000)',
763 '&vpslldq ($t2,$t2,8)',
764 '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
# Four stitched rounds of the AVX1 path; structurally identical to
# XOP_256_00_47 but the SIMD schedule update comes from Xupdate_256_AVX.
768 sub AVX_256_00_47 () {
772 my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
774 foreach (Xupdate_256_AVX()) { # 29 instructions
# Pre-add K[] round constants and stash the prepared schedule on stack.
780 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
781 foreach (@insns) { eval; } # remaining instructions
782 &vmovdqa (16*$j."(%rsp)",$t2);
# Emit all 4*16 schedule-update rounds, rotating @X each iteration.
786 for ($i=0,$j=0; $j<4; $j++) {
787 &AVX_256_00_47($j,\&body_00_15,@X);
788 push(@X,shift(@X)); # rotate(@X)
# Block boundary: merge masked AES tail into $iv, store ciphertext.
790 &mov ("%r12",$_inp); # borrow $a4
791 &vpand ($temp,$temp,$mask14);
792 &mov ("%r15",$_out); # borrow $a2
793 &vpor ($iv,$iv,$temp);
794 &vmovdqu ("(%r15,%r12)",$iv); # write output
795 &lea ("%r12","16(%r12)"); # inp++
# Sentinel byte in the table marks the end of the K256 constants.
797 &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
798 &jne (".Lavx_00_47");
800 &vmovdqu ($inout,"(%r12)");
# Final 16 rounds over the precomputed schedule.
804 for ($i=0; $i<16; ) {
805 foreach(body_00_15()) { eval; }
810 mov $_inp,%r12 # borrow $a4
811 mov $_out,%r13 # borrow $a0
812 mov $_ctx,%r15 # borrow $a2
813 mov $_in0,%rsi # borrow $a3
815 vpand $mask14,$temp,$temp
818 vmovdqu $iv,(%r13,%r12) # write output
819 lea 16(%r12),%r12 # inp++
844 vmovdqu $iv,($ivp) # output IV
847 $code.=<<___ if ($win64);
848 movaps `$framesz+16*0`(%rsp),%xmm6
849 movaps `$framesz+16*1`(%rsp),%xmm7
850 movaps `$framesz+16*2`(%rsp),%xmm8
851 movaps `$framesz+16*3`(%rsp),%xmm9
852 movaps `$framesz+16*4`(%rsp),%xmm10
853 movaps `$framesz+16*5`(%rsp),%xmm11
854 movaps `$framesz+16*6`(%rsp),%xmm12
855 movaps `$framesz+16*7`(%rsp),%xmm13
856 movaps `$framesz+16*8`(%rsp),%xmm14
857 movaps `$framesz+16*9`(%rsp),%xmm15
869 .size ${func}_avx,.-${func}_avx
873 ######################################################################
# AVX2/BMI2 round body: uses non-flag-clobbering rorx/andn plus lea for
# additions, which removes partial-flag stalls compared to ror/add.
876 my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
881 # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
883 '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
885 '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
886 '&and ($a4,$e)', # f&e
887 '&rorx ($a0,$e,$Sigma1[2])',
888 '&rorx ($a2,$e,$Sigma1[1])',
# Sigma0(a) of the previous round is applied here, one round late.
890 '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
891 '&lea ($h,"($h,$a4)")',
892 '&andn ($a4,$e,$g)', # ~e&g
895 '&rorx ($a1,$e,$Sigma1[0])',
896 '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
897 '&xor ($a0,$a1)', # Sigma1(e)
900 '&rorx ($a4,$a,$Sigma0[2])',
901 '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
902 '&xor ($a2,$b)', # a^b, b^c in next round
903 '&rorx ($a1,$a,$Sigma0[1])',
905 '&rorx ($a0,$a,$Sigma0[0])',
906 '&lea ($d,"($d,$h)")', # d+=h
907 '&and ($a3,$a2)', # (b^c)&(a^b)
# Interleave the next pending AES-CBC instruction into this round.
908 @aesni_cbc_block[$aesni_cbc_idx++].
911 '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
912 '&xor ($a1,$a0)', # Sigma0(a)
913 '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
914 '&mov ($a4,$e)', # copy of f in future
916 '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
918 # and at the finish one has to $a+=$a1
922 .type ${func}_avx2,\@function,6
926 mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter
933 mov %rsp,%r11 # copy %rsp
934 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
935 and \$-256*$SZ,%rsp # align stack frame
936 add \$`2*$SZ*($rounds-8)`,%rsp
939 sub $inp,$out # re-bias
941 add $inp,$len # end of input
943 #mov $inp,$_inp # saved later
944 #mov $out,$_out # kept in $offload
946 #mov $key,$_key # remains resident in $inp register
952 $code.=<<___ if ($win64);
953 movaps %xmm6,`$framesz+16*0`(%rsp)
954 movaps %xmm7,`$framesz+16*1`(%rsp)
955 movaps %xmm8,`$framesz+16*2`(%rsp)
956 movaps %xmm9,`$framesz+16*3`(%rsp)
957 movaps %xmm10,`$framesz+16*4`(%rsp)
958 movaps %xmm11,`$framesz+16*5`(%rsp)
959 movaps %xmm12,`$framesz+16*6`(%rsp)
960 movaps %xmm13,`$framesz+16*7`(%rsp)
961 movaps %xmm14,`$framesz+16*8`(%rsp)
962 movaps %xmm15,`$framesz+16*9`(%rsp)
968 mov $inp,%r13 # borrow $a0
969 vpinsrq \$1,$out,$offload,$offload
970 lea 0x80($key),$inp # size optimization, reassign
971 lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4
972 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1
973 mov $ctx,%r15 # borrow $a2
974 mov $in0,%rsi # borrow $a3
975 vmovdqu ($ivp),$iv # load IV
978 vmovdqa 0x00(%r12,%r14,8),$mask14
979 vmovdqa 0x10(%r12,%r14,8),$mask12
980 vmovdqa 0x20(%r12,%r14,8),$mask10
982 sub \$-16*$SZ,%r13 # inp++, size optimization
984 lea (%rsi,%r13),%r12 # borrow $a0
986 cmp $len,%r13 # $_end
988 cmove %rsp,%r12 # next block or random data
994 vmovdqu 0x00-0x80($inp),$roundkey
996 if ($SZ==4) { # SHA256
997 my @X = map("%ymm$_",(0..3));
998 my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1004 vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1005 vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1006 vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1007 vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1008 vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1010 vinserti128 \$1,(%r12),@X[0],@X[0]
1011 vinserti128 \$1,16(%r12),@X[1],@X[1]
1012 vpshufb $t3,@X[0],@X[0]
1013 vinserti128 \$1,32(%r12),@X[2],@X[2]
1014 vpshufb $t3,@X[1],@X[1]
1015 vinserti128 \$1,48(%r12),@X[3],@X[3]
1017 lea $TABLE(%rip),$Tbl
1018 vpshufb $t3,@X[2],@X[2]
1019 lea -16*$SZ(%r13),%r13
1020 vpaddd 0x00($Tbl),@X[0],$t0
1021 vpshufb $t3,@X[3],@X[3]
1022 vpaddd 0x20($Tbl),@X[1],$t1
1023 vpaddd 0x40($Tbl),@X[2],$t2
1024 vpaddd 0x60($Tbl),@X[3],$t3
1025 vmovdqa $t0,0x00(%rsp)
1027 vmovdqa $t1,0x20(%rsp)
1028 lea -$PUSH8(%rsp),%rsp
1030 vmovdqa $t2,0x00(%rsp)
1032 vmovdqa $t3,0x20(%rsp)
1034 sub \$-16*2*$SZ,$Tbl # size optimization
1039 vmovdqu (%r13),$inout
1040 vpinsrq \$0,%r13,$offload,$offload
# Four stitched rounds of the AVX2 path.  Two 512-byte halves of the
# schedule live on a sliding stack window ($PUSH8 steps) so two message
# blocks can be processed per 256-bit ymm lane.
1043 sub AVX2_256_00_47 () {
1047 my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
1048 my $base = "+2*$PUSH8(%rsp)";
# Slide the stack window down every other iteration.
1050 &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
1051 foreach (Xupdate_256_AVX()) { # 29 instructions
1053 eval(shift(@insns));
1054 eval(shift(@insns));
1055 eval(shift(@insns));
# Pre-add K[] constants; result parked in the current stack window.
1057 &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
1058 foreach (@insns) { eval; } # remaining instructions
1059 &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
# Emit all schedule-update rounds, rotating @X each time.
1062 for ($i=0,$j=0; $j<4; $j++) {
1063 &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1064 push(@X,shift(@X)); # rotate(@X)
# inp/out pointers are kept packed in the $offload xmm register here
# instead of stack slots (stack pointer is sliding on this path).
1066 &vmovq ("%r13",$offload); # borrow $a0
1067 &vpextrq ("%r15",$offload,1); # borrow $a2
1068 &vpand ($temp,$temp,$mask14);
1069 &vpor ($iv,$iv,$temp);
1070 &vmovdqu ("(%r15,%r13)",$iv); # write output
1071 &lea ("%r13","16(%r13)"); # inp++
# Advance the table pointer and loop until the sentinel byte is hit.
1073 &lea ($Tbl,16*2*$SZ."($Tbl)");
1074 &cmpb (($SZ-1)."($Tbl)",0);
1075 &jne (".Lavx2_00_47");
1077 &vmovdqu ($inout,"(%r13)");
1078 &vpinsrq ($offload,$offload,"%r13",0);
# Final 16 rounds over the precomputed schedule; $base selects which
# half of the sliding window holds the current block's X[]+K[].
1081 for ($i=0; $i<16; ) {
1082 my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1083 foreach(bodyx_00_15()) { eval; }
1087 vpextrq \$1,$offload,%r12 # $_out, borrow $a4
1088 vmovq $offload,%r13 # $_inp, borrow $a0
1089 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1091 lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
1093 vpand $mask14,$temp,$temp
1095 vmovdqu $iv,(%r12,%r13) # write output
1116 cmp `$PUSH8+2*8`($Tbl),%r13 # $_end
1126 vmovdqu (%r13),$inout
1127 vpinsrq \$0,%r13,$offload,$offload
1130 for ($i=0; $i<16; ) {
1131 my $base="+16($Tbl)";
1132 foreach(bodyx_00_15()) { eval; }
1133 &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8);
1136 vmovq $offload,%r13 # borrow $a0
1137 vpextrq \$1,$offload,%r15 # borrow $a2
1138 vpand $mask14,$temp,$temp
1140 lea -$PUSH8($Tbl),$Tbl
1141 vmovdqu $iv,(%r15,%r13) # write output
1142 lea 16(%r13),%r13 # inp++
1146 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2
1147 lea 16*$SZ(%r13),%r13
1148 mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3
1150 lea `2*$SZ*($rounds-8)`(%rsp),%rsp
1159 lea (%rsi,%r13),%r12
1165 cmove %rsp,%r12 # next block or stale data
1181 vmovdqu $iv,($ivp) # output IV
1184 $code.=<<___ if ($win64);
1185 movaps `$framesz+16*0`(%rsp),%xmm6
1186 movaps `$framesz+16*1`(%rsp),%xmm7
1187 movaps `$framesz+16*2`(%rsp),%xmm8
1188 movaps `$framesz+16*3`(%rsp),%xmm9
1189 movaps `$framesz+16*4`(%rsp),%xmm10
1190 movaps `$framesz+16*5`(%rsp),%xmm11
1191 movaps `$framesz+16*6`(%rsp),%xmm12
1192 movaps `$framesz+16*7`(%rsp),%xmm13
1193 movaps `$framesz+16*8`(%rsp),%xmm14
1194 movaps `$framesz+16*9`(%rsp),%xmm15
1206 .size ${func}_avx2,.-${func}_avx2
# SHA-NI (shaext) path register allocation.  Note the argument registers
# are rebound here: $in0 takes %rdi and $inp moves to %r10.
1211 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1213 my ($rounds,$Tbl)=("%r11d","%rbx");
# AES-CBC state: IV accumulator, current input block, first round key.
1215 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1216 my @rndkey=("%xmm4","%xmm5");
# SHA-NI state: W[i] slot, the ABEF/CDGH state halves required by
# sha256rnds2, byte-swap mask, and saved state for the feed-forward add.
1220 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1221 my @MSG=map("%xmm$_",(10..13));
1225 my ($n,$k)=($r/10,$r%10);
1228 movups `16*$n`($in0),$in # load input
1231 $code.=<<___ if ($n);
1232 movups $iv,`16*($n-1)`($out,$in0) # write output
1236 movups `32+16*$k-112`($key),$rndkey[1]
1237 aesenc $rndkey[0],$iv
1244 movups `32+16*($k+0)-112`($key),$rndkey[1]
1245 aesenc $rndkey[0],$iv
1246 movups `32+16*($k+1)-112`($key),$rndkey[0]
1247 aesenc $rndkey[1],$iv
1249 movups `32+16*($k+2)-112`($key),$rndkey[1]
1250 aesenc $rndkey[0],$iv
1251 movups `32+16*($k+3)-112`($key),$rndkey[0]
1252 aesenc $rndkey[1],$iv
1254 aesenclast $rndkey[0],$iv
1255 movups 16-112($key),$rndkey[1] # forward reference
1260 movups `32+16*$k-112`($key),$rndkey[1]
1261 aesenc $rndkey[0],$iv
1264 $r++; unshift(@rndkey,pop(@rndkey));
1271 .type ${func}_shaext,\@function,6
1274 mov `($win64?56:8)`(%rsp),$inp # load 7th argument
1276 $code.=<<___ if ($win64);
1277 lea `-8-10*16`(%rsp),%rsp
1278 movaps %xmm6,-8-10*16(%rax)
1279 movaps %xmm7,-8-9*16(%rax)
1280 movaps %xmm8,-8-8*16(%rax)
1281 movaps %xmm9,-8-7*16(%rax)
1282 movaps %xmm10,-8-6*16(%rax)
1283 movaps %xmm11,-8-5*16(%rax)
1284 movaps %xmm12,-8-4*16(%rax)
1285 movaps %xmm13,-8-3*16(%rax)
1286 movaps %xmm14,-8-2*16(%rax)
1287 movaps %xmm15,-8-1*16(%rax)
1291 lea K256+0x80(%rip),$Tbl
1292 movdqu ($ctx),$ABEF # DCBA
1293 movdqu 16($ctx),$CDGH # HGFE
1294 movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
1296 mov 240($key),$rounds
1298 movups ($key),$rndkey0 # $key[0]
1299 movups 16($key),$rndkey[0] # forward reference
1300 lea 112($key),$key # size optimization
1302 pshufd \$0x1b,$ABEF,$Wi # ABCD
1303 pshufd \$0xb1,$ABEF,$ABEF # CDAB
1304 pshufd \$0x1b,$CDGH,$CDGH # EFGH
1305 movdqa $TMP,$BSWAP # offload
1306 palignr \$8,$CDGH,$ABEF # ABEF
1307 punpcklqdq $Wi,$CDGH # CDGH
1313 movdqu ($inp),@MSG[0]
1314 movdqu 0x10($inp),@MSG[1]
1315 movdqu 0x20($inp),@MSG[2]
1317 movdqu 0x30($inp),@MSG[3]
1319 movdqa 0*32-0x80($Tbl),$Wi
1322 movdqa $CDGH,$CDGH_SAVE # offload
1323 movdqa $ABEF,$ABEF_SAVE # offload
1327 sha256rnds2 $ABEF,$CDGH # 0-3
1328 pshufd \$0x0e,$Wi,$Wi
1332 sha256rnds2 $CDGH,$ABEF
1334 movdqa 1*32-0x80($Tbl),$Wi
1341 sha256rnds2 $ABEF,$CDGH # 4-7
1342 pshufd \$0x0e,$Wi,$Wi
1346 sha256rnds2 $CDGH,$ABEF
1348 movdqa 2*32-0x80($Tbl),$Wi
1351 sha256msg1 @MSG[1],@MSG[0]
1355 sha256rnds2 $ABEF,$CDGH # 8-11
1356 pshufd \$0x0e,$Wi,$Wi
1358 palignr \$4,@MSG[2],$TMP
1363 sha256rnds2 $CDGH,$ABEF
1365 movdqa 3*32-0x80($Tbl),$Wi
1367 sha256msg2 @MSG[3],@MSG[0]
1368 sha256msg1 @MSG[2],@MSG[1]
1372 sha256rnds2 $ABEF,$CDGH # 12-15
1373 pshufd \$0x0e,$Wi,$Wi
1378 palignr \$4,@MSG[3],$TMP
1380 sha256rnds2 $CDGH,$ABEF
1382 for($i=4;$i<16-3;$i++) {
1383 &$aesenc() if (($r%10)==0);
1385 movdqa $i*32-0x80($Tbl),$Wi
1387 sha256msg2 @MSG[0],@MSG[1]
1388 sha256msg1 @MSG[3],@MSG[2]
1392 sha256rnds2 $ABEF,$CDGH # 16-19...
1393 pshufd \$0x0e,$Wi,$Wi
1395 palignr \$4,@MSG[0],$TMP
1399 &$aesenc() if ($r==19);
1401 sha256rnds2 $CDGH,$ABEF
1403 push(@MSG,shift(@MSG));
1406 movdqa 13*32-0x80($Tbl),$Wi
1408 sha256msg2 @MSG[0],@MSG[1]
1409 sha256msg1 @MSG[3],@MSG[2]
1413 sha256rnds2 $ABEF,$CDGH # 52-55
1414 pshufd \$0x0e,$Wi,$Wi
1416 palignr \$4,@MSG[0],$TMP
1422 sha256rnds2 $CDGH,$ABEF
1424 movdqa 14*32-0x80($Tbl),$Wi
1426 sha256msg2 @MSG[1],@MSG[2]
1431 sha256rnds2 $ABEF,$CDGH # 56-59
1432 pshufd \$0x0e,$Wi,$Wi
1436 sha256rnds2 $CDGH,$ABEF
1438 movdqa 15*32-0x80($Tbl),$Wi
1444 sha256rnds2 $ABEF,$CDGH # 60-63
1445 pshufd \$0x0e,$Wi,$Wi
1449 sha256rnds2 $CDGH,$ABEF
1450 #pxor $CDGH,$rndkey0 # black magic
1452 while ($r<40) { &$aesenc(); } # remaining aesenc's
1454 #xorps $CDGH,$rndkey0 # black magic
1455 paddd $CDGH_SAVE,$CDGH
1456 paddd $ABEF_SAVE,$ABEF
1459 movups $iv,48($out,$in0) # write output
1463 pshufd \$0xb1,$CDGH,$CDGH # DCHG
1464 pshufd \$0x1b,$ABEF,$TMP # FEBA
1465 pshufd \$0xb1,$ABEF,$ABEF # BAFE
1466 punpckhqdq $CDGH,$ABEF # DCBA
1467 palignr \$8,$TMP,$CDGH # HGFE
1469 movups $iv,($ivp) # write IV
1471 movdqu $CDGH,16($ctx)
1473 $code.=<<___ if ($win64);
1474 movaps 0*16(%rsp),%xmm6
1475 movaps 1*16(%rsp),%xmm7
1476 movaps 2*16(%rsp),%xmm8
1477 movaps 3*16(%rsp),%xmm9
1478 movaps 4*16(%rsp),%xmm10
1479 movaps 5*16(%rsp),%xmm11
1480 movaps 6*16(%rsp),%xmm12
1481 movaps 7*16(%rsp),%xmm13
1482 movaps 8*16(%rsp),%xmm14
1483 movaps 9*16(%rsp),%xmm15
1484 lea 8+10*16(%rsp),%rsp
1489 .size ${func}_shaext,.-${func}_shaext
1494 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1495 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1502 $code.=<<___ if ($avx);
1503 .extern __imp_RtlVirtualUnwind
1504 .type se_handler,\@abi-omnipotent
1518 mov 120($context),%rax # pull context->Rax
1519 mov 248($context),%rbx # pull context->Rip
1521 mov 8($disp),%rsi # disp->ImageBase
1522 mov 56($disp),%r11 # disp->HanderlData
1524 mov 0(%r11),%r10d # HandlerData[0]
1525 lea (%rsi,%r10),%r10 # prologue label
1526 cmp %r10,%rbx # context->Rip<prologue label
1529 mov 152($context),%rax # pull context->Rsp
1531 mov 4(%r11),%r10d # HandlerData[1]
1532 lea (%rsi,%r10),%r10 # epilogue label
1533 cmp %r10,%rbx # context->Rip>=epilogue label
1536 $code.=<<___ if ($shaext);
1537 lea aesni_cbc_sha256_enc_shaext(%rip),%r10
1542 lea 512($context),%rdi # &context.Xmm6
1544 .long 0xa548f3fc # cld; rep movsq
1545 lea 168(%rax),%rax # adjust stack pointer
1549 $code.=<<___ if ($avx>1);
1550 lea .Lavx2_shortcut(%rip),%r10
1551 cmp %r10,%rbx # context->Rip<avx2_shortcut
1555 add \$`2*$SZ*($rounds-8)`,%rax
1559 mov %rax,%rsi # put aside Rsp
1560 mov 16*$SZ+7*8(%rax),%rax # pull $_rsp
1569 mov %rbx,144($context) # restore context->Rbx
1570 mov %rbp,160($context) # restore context->Rbp
1571 mov %r12,216($context) # restore context->R12
1572 mov %r13,224($context) # restore context->R13
1573 mov %r14,232($context) # restore context->R14
1574 mov %r15,240($context) # restore context->R15
1576 lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area
1577 lea 512($context),%rdi # &context.Xmm6
1579 .long 0xa548f3fc # cld; rep movsq
1584 mov %rax,152($context) # restore context->Rsp
1585 mov %rsi,168($context) # restore context->Rsi
1586 mov %rdi,176($context) # restore context->Rdi
1588 mov 40($disp),%rdi # disp->ContextRecord
1589 mov $context,%rsi # context
1590 mov \$154,%ecx # sizeof(CONTEXT)
1591 .long 0xa548f3fc # cld; rep movsq
1594 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1595 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1596 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1597 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1598 mov 40(%rsi),%r10 # disp->ContextRecord
1599 lea 56(%rsi),%r11 # &disp->HandlerData
1600 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1601 mov %r10,32(%rsp) # arg5
1602 mov %r11,40(%rsp) # arg6
1603 mov %r12,48(%rsp) # arg7
1604 mov %rcx,56(%rsp) # arg8, (NULL)
1605 call *__imp_RtlVirtualUnwind(%rip)
1607 mov \$1,%eax # ExceptionContinueSearch
1619 .size se_handler,.-se_handler
1622 .rva .LSEH_begin_${func}_xop
1623 .rva .LSEH_end_${func}_xop
1624 .rva .LSEH_info_${func}_xop
1626 .rva .LSEH_begin_${func}_avx
1627 .rva .LSEH_end_${func}_avx
1628 .rva .LSEH_info_${func}_avx
1630 $code.=<<___ if ($avx>1);
1631 .rva .LSEH_begin_${func}_avx2
1632 .rva .LSEH_end_${func}_avx2
1633 .rva .LSEH_info_${func}_avx2
1635 $code.=<<___ if ($shaext);
1636 .rva .LSEH_begin_${func}_shaext
1637 .rva .LSEH_end_${func}_shaext
1638 .rva .LSEH_info_${func}_shaext
1640 $code.=<<___ if ($avx);
1643 .LSEH_info_${func}_xop:
1646 .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
1648 .LSEH_info_${func}_avx:
1651 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
1653 $code.=<<___ if ($avx>1);
1654 .LSEH_info_${func}_avx2:
1657 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
1659 $code.=<<___ if ($shaext);
1660 .LSEH_info_${func}_shaext:
1663 .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
1667 ####################################################################
# Hand-encoder for SHA-NI instructions, needed because older assemblers
# do not know the sha256* mnemonics; emits raw ".byte" sequences instead.
1669 local *opcode=shift;
# Build a REX prefix when either operand uses an extended register (>=8).
1673 $rex|=0x04 if($dst>=8);
1674 $rex|=0x01 if($src>=8);
1675 unshift @opcode,$rex|0x40 if($rex);
# 0F 38 opcode map: third opcode byte per SHA-NI instruction.
1680 "sha256rnds2" => 0xcb,
1681 "sha256msg1" => 0xcc,
1682 "sha256msg2" => 0xcd );
# NOTE(review): @_[0] is a one-element array slice; $_[0] is the
# conventional element access (same value here, but warns under -w).
1687 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1688 my @opcode=(0x0f,0x38);
# AT&T operand order is src,dst -- hence rex(\@opcode,$2,$1).
1689 rex(\@opcode,$2,$1);
1690 push @opcode,$opcodelet{$instr};
1691 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
1692 return ".byte\t".join(',',@opcode);
# Unknown mnemonic: pass the instruction through unchanged.
1694 return $instr."\t".@_[0];
# Post-process the accumulated code: expand `...` interpolations, then
# rewrite sha256* mnemonics via the encoder above before printing.
1699 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1700 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;