# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encrypt exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm utilizes processor
# resources better and achieves higher performance. The SHA256 instruction
# sequences(*) are taken from sha512-x86_64.pl and the AESNI code is woven
# into them. As SHA256 dominates execution time, stitch performance does
# not depend on AES key length. Below are performance numbers in cycles
# per processed byte, less is better, for standalone AESNI-CBC encrypt,
# standalone SHA256, and stitched subroutine:
#
#		 AES-128/-192/-256+SHA256	this(**)	gain
# Sandy Bridge	 5.05/6.05/7.05+11.6		13.0		+28%/36%/43%
# Ivy Bridge	 5.05/6.05/7.05+10.3		11.6		+32%/41%/50%
# Haswell	 4.43/5.29/6.19+7.80		8.79		+39%/49%/59%
# Skylake	 2.62/3.14/3.62+7.70		8.10		+27%/34%/40%
# Bulldozer	 5.77/6.89/8.00+13.7		13.7		+42%/50%/58%
# Ryzen(***)	 2.71/-/3.71+2.05		2.74/-/3.73	+74%/-/54%
# Goldmont(***)	 3.82/-/5.35+4.16		4.73/-/5.94	+69%/-/60%
#
# (*)	there are XOP, AVX1 and AVX2 code paths, meaning that Westmere
#	is omitted from the loop; this is because the estimated gain was
#	not high enough to justify the effort;
# (**)	these are EVP-free results, results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
# (***)	these are SHAEXT results;
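#
# To read the table: a naive back-to-back run of AES-128-CBC and SHA256
# on Haswell would cost 4.43+7.80=12.23 cycles per byte, while the
# stitched code spends 8.79, i.e. 12.23/8.79~=1.39, the quoted +39%.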
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
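# So, for example (a typical invocation; the script name is assumed
# here, the flavours are those x86_64-xlate.pl understands):
#
#	perl aesni-sha256-x86_64.pl elf  aesni-sha256-x86_64.s
#	perl aesni-sha256-x86_64.pl nasm aesni-sha256-x86_64.asm
#
# $flavour selects the assembler dialect and $output the file written
# by the x86_64-xlate.pl pipe opened below.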
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}
$shaext=$avx;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);
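# At this point $avx selects the widest code path the assembler can
# digest: 0 emits no AVX code, 1 enables the XOP/AVX1 subroutines and 2
# additionally enables AVX2 (each probe above sums two boolean version
# checks to produce 0, 1 or 2), while $shaext independently gates the
# SHA-extension flavour.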
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
$func="aesni_cbc_sha256_enc";

@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA256_CTX *ctx,
#			const void *in0);
($inp, $out, $len, $key, $ivp, $ctx, $in0) =
("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="`16*$SZ+7*8`(%rsp)";
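# A sketch of the resulting stack frame (relative to the aligned %rsp
# set up in the prologues below):
#
#	(%rsp) ... 16*$SZ-1(%rsp)	X[i]+K[i] working area, 16 slots
#	16*$SZ+0*8 ... 16*$SZ+6*8	saved $inp/$out/$end/$key/$ivp/$ctx/$in0
#	16*$SZ+7*8			original %rsp, for unwinding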
.extern	OPENSSL_ia32cap_P
.type	$func,\@abi-omnipotent
	lea	OPENSSL_ia32cap_P(%rip),%r11
	cmp	\$0,`$win64?"%rcx":"%rdi"`
$code.=<<___ if ($shaext);
	bt	\$61,%r10			# check for SHA
	test	\$`1<<11`,%r10d			# check for XOP
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	and	\$`1<<28`,%r10d			# check for AVX
	cmp	\$0,`$win64?"%rcx":"%rdi"`
.type	$TABLE,\@object
$TABLE:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
	.long	0,0,0,0,   0,0,0,0
	.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
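# Each 16-byte row of K256 constants above is stored twice so that the
# AVX2 code path can vpaddd a full %ymm register with the same four
# constants replicated in both 128-bit lanes. The trailing 0/-1 words
# past the byte-swap mask feed $mask10/$mask12/$mask14: for a given AES
# round count exactly one of them loads as all-ones, which is what lets
# @aesni_cbc_block below blend the speculative vaesenclast results down
# to the single correct ciphertext block.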
######################################################################
my ($iv,$inout,$roundkey,$temp,
    $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));

my @aesni_cbc_block = (
##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
##	&vmovdqu	($inout,($inp));
##	&mov		($_inp,$inp);
	'&vpxor		($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',

	'&vpxor		($inout,$inout,$iv);',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',

	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',

	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',

	'&vpand		($iv,$temp,$mask10);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',

	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',

	'&vpand		($temp,$temp,$mask12);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xe0-0x80($inp)");',

	'&vpor		($iv,$iv,$temp);'.
	' &vaesenclast	($temp,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'
##	&mov		($inp,$_inp);
##	&mov		($out,$_out);
##	&vpand		($temp,$temp,$mask14);
##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	($iv,"($out,$inp)");
##	&lea		($inp,"16($inp)");
);
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
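# For example (a sketch), with $a0 mapped to %r13d and SHA256's
# @Sigma1=(6,11,25), the call '&ror($a0,$Sigma1[2]-$Sigma1[1])' below is
# thunked through AUTOLOAD into "\tror\t\$14,%r13d" being appended to
# $code: the numeric argument is popped, prefixed with '$', and the
# remaining operands are emitted in reverse (AT&T) order.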
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g

	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&and	($a4,$e)',			# (f^g)&e

	@aesni_cbc_block[$aesni_cbc_idx++].
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]

	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&xor	($a2,$b)',			# a^b, b^c in next round

	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&and	($a3,$a2)',			# (b^c)&(a^b)

	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)

	'&add	($d,$h)',			# d+=h
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($h,$a3)',			# h+=Maj(a,b,c)

	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
}
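# Each body_00_15() call above yields one SHA256 round's worth of
# instruction strings; @aesni_cbc_block has exactly 16 entries, and
# $aesni_cbc_idx steps through them one per round, so a single AES-CBC
# step (xor/aesenc/aesenclast plus the next round-key load) is dribbled
# into every round of a 16-round pass, filling execution ports the
# scalar SHA code leaves idle.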
######################################################################
.type	${func}_xop,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp		# align stack frame
	sub	$inp,$out		# re-bias
	add	$inp,$len		# end of input
	#mov	$inp,$_inp		# saved later
	#mov	$key,$_key		# remains resident in $inp register
.cfi_cfa_expression	$_rsp,deref,+8
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r12		# borrow $a4
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r12),$inout		# $a4
sub XOP_256_00_47 () {
my ($j,$body,@X) = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	&vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	&vpsrld		($t0,$t0,$sigma0[2]);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
	&vpxor		($t0,$t0,$t1);
	&vprotd		($t3,@X[3],8*$SZ-$sigma1[1]);
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	&vpsrld		($t2,@X[3],$sigma1[2]);
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	&vpsrldq	($t3,$t3,8);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	&vprotd		($t3,@X[0],8*$SZ-$sigma1[1]);
	&vpsrld		($t2,@X[0],$sigma1[2]);
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	&vpslldq	($t3,$t3,8);		# 22 instructions
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}
    for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov		("%r12",$_inp);		# borrow $a4
	&vpand		($temp,$temp,$mask14);
	&mov		("%r15",$_out);		# borrow $a2
	&vpor		($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea		("%r12","16(%r12)");	# inp++

	&cmpb		($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne		(".Lxop_00_47");
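# Loop-termination trick (a sketch of the logic): each .Lxop_00_47 pass
# consumes 16*2*$SZ bytes of K256 and advances $Tbl accordingly, and the
# cmpb above peeks at byte $SZ-1 of the chunk the next pass would use.
# That byte is non-zero for every K256 constant but zero in the shuffle
# mask that follows the constants, so the loop exits after three passes,
# i.e. after rounds 0..47.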
	&vmovdqu	($inout,"(%r12)");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3

	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++

	vmovdqu	$iv,($ivp)		# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.cfi_def_cfa_register	%rsp
.size	${func}_xop,.-${func}_xop
######################################################################
local *ror = sub { &shrd(@_[0],@_) };

.type	${func}_avx,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp		# align stack frame
	sub	$inp,$out		# re-bias
	add	$inp,$len		# end of input
	#mov	$inp,$_inp		# saved later
	#mov	$key,$_key		# remains resident in $inp register
.cfi_cfa_expression	$_rsp,deref,+8
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r12		# borrow $a4
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r12),$inout		# $a4
sub Xupdate_256_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',		# sigma1(X[14..15])
	'&vpshufd	($t2,$t2,0b10000100)',
	'&vpsrldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',
	'&vpshufd	($t2,$t2,0b11101000)',
	'&vpslldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
	);
}
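# The schedule above implements the SHA256 message expansion
#	X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
# with sigma0(x) = (x>>>7)^(x>>>18)^(x>>3) and
#      sigma1(x) = (x>>>17)^(x>>>19)^(x>>10),
# four words at a time; since sigma1() of the freshly computed X[0..1]
# is itself needed for X[2..3], the tail is done in two 2-word halves.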
sub AVX_256_00_47 () {
my ($j,$body,@X) = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov		("%r12",$_inp);		# borrow $a4
	&vpand		($temp,$temp,$mask14);
	&mov		("%r15",$_out);		# borrow $a2
	&vpor		($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea		("%r12","16(%r12)");	# inp++

	&cmpb		($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne		(".Lavx_00_47");
	&vmovdqu	($inout,"(%r12)");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
	mov	$_inp,%r12		# borrow $a4
	mov	$_out,%r13		# borrow $a0
	mov	$_ctx,%r15		# borrow $a2
	mov	$_in0,%rsi		# borrow $a3

	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)		# write output
	lea	16(%r12),%r12		# inp++

	vmovdqu	$iv,($ivp)		# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.cfi_def_cfa_register	%rsp
.size	${func}_avx,.-${func}_avx
######################################################################
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp

sub bodyx_00_15 () {
	# at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',		# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',

	'&lea	($a,"($a,$a1)")',	# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',		# ~e&g
	'&rorx	($a1,$e,$Sigma1[0])',

	'&lea	($h,"($h,$a4)")',	# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',		# Sigma1(e)
	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',	# h+=Sigma1(e)
	'&xor	($a2,$b)',		# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',
	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',	# d+=h
	'&and	($a3,$a2)',		# (b^c)&(a^b)
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a3,$b)',		# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',		# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.	# h+=Maj(a,b,c)
	'&mov	($a4,$e)',		# copy of f in future

	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
}
# and at the finish one has to $a+=$a1
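# A note on the BMI2 flavour above: rorx replaces the flag-clobbering
# ror, andn computes ~e&g in one step so that Ch(e,f,g)=(e&f)+(~e&g)
# (the two terms are bitwise disjoint, so '+' equals '^') needs no extra
# copies, and three-operand lea accumulates the additions without
# touching the flags at all; the deferred "h+=Sigma0(a)" from the
# previous round is folded in first, which is why the caller must finish
# with $a+=$a1.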
.type	${func}_avx2,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%rax		# copy %rsp
.cfi_def_cfa_register	%rax
	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
	and	\$-256*$SZ,%rsp		# align stack frame
	add	\$`2*$SZ*($rounds-8)`,%rsp
	sub	$inp,$out		# re-bias
	add	$inp,$len		# end of input
	#mov	$inp,$_inp		# saved later
	#mov	$out,$_out		# kept in $offload
	#mov	$key,$_key		# remains resident in $inp register
.cfi_cfa_expression	$_rsp,deref,+8
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r13		# borrow $a0
	vpinsrq	\$1,$out,$offload,$offload
	lea	0x80($key),$inp		# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
	mov	0xf0-0x80($inp),%r14d	# rounds, borrow $a1
	mov	$ctx,%r15		# borrow $a2
	mov	$in0,%rsi		# borrow $a3
	vmovdqu	($ivp),$iv		# load IV
	vmovdqa	0x00(%r12,%r14,8),$mask14
	vmovdqa	0x10(%r12,%r14,8),$mask12
	vmovdqa	0x20(%r12,%r14,8),$mask10

	sub	\$-16*$SZ,%r13		# inp++, size optimization
	lea	(%rsi,%r13),%r12	# borrow $a0
	cmp	$len,%r13		# $_end
	cmove	%rsp,%r12		# next block or random data
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	lea	-16*$SZ(%r13),%r13
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x20(%rsp)
$code.=<<___ if (!$win64);
# temporarily use %rsi as frame pointer
	lea	-$PUSH8(%rsp),%rsp
$code.=<<___ if (!$win64);
# the frame info is at $_rsp, but the stack is moving...
# so a second frame pointer is saved at -8(%rsp)
# that is in the red zone
.cfi_cfa_expression	%rsp-8,deref,+8
	vmovdqa	$t2,0x00(%rsp)
	vmovdqa	$t3,0x20(%rsp)
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
sub AVX2_256_00_47 () {
my ($j,$body,@X) = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)");
$code.=<<___ if (!$win64);
.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
# copy secondary frame pointer to new location again at -8(%rsp)
	pushq	$PUSH8-8(%rsp)
.cfi_cfa_expression	%rsp,deref,+8
.cfi_cfa_expression	%rsp-8,deref,+8

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&vmovq		("%r13",$offload);	# borrow $a0
	&vpextrq	("%r15",$offload,1);	# borrow $a2
	&vpand		($temp,$temp,$mask14);
	&vpor		($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r13)",$iv);	# write output
	&lea		("%r13","16(%r13)");	# inp++

	&lea		($Tbl,16*2*$SZ."($Tbl)");
	&cmpb		(($SZ-1)."($Tbl)",0);
	&jne		(".Lavx2_00_47");
	&vmovdqu	($inout,"(%r13)");
	&vpinsrq	($offload,$offload,"%r13",0);

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
	vmovq	$offload,%r13			# $_inp, borrow $a0
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl

	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r12,%r13)			# write output

	cmp	`$PUSH8+2*8`($Tbl),%r13		# $_end
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
    for ($i=0; $i<16; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
    }
	vmovq	$offload,%r13			# borrow $a0
	vpextrq	\$1,$offload,%r15		# borrow $a2
	vpand	$mask14,$temp,$temp
	lea	-$PUSH8($Tbl),$Tbl
	vmovdqu	$iv,(%r15,%r13)			# write output
	lea	16(%r13),%r13			# inp++

	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	16*$SZ(%r13),%r13
	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
	lea	(%rsi,%r13),%r12
	cmove	%rsp,%r12		# next block or stale data

# temporarily use $Tbl as index to $_rsp
# this avoids the need to save a secondary frame pointer at -8(%rsp)
.cfi_cfa_expression	$Tbl+`16*$SZ+7*8`,deref,+8
	mov	16*$SZ+4*8($Tbl),$ivp
	mov	16*$SZ+7*8($Tbl),%rsi
	vmovdqu	$iv,($ivp)		# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`($Tbl),%xmm6
	movaps	`$framesz+16*1`($Tbl),%xmm7
	movaps	`$framesz+16*2`($Tbl),%xmm8
	movaps	`$framesz+16*3`($Tbl),%xmm9
	movaps	`$framesz+16*4`($Tbl),%xmm10
	movaps	`$framesz+16*5`($Tbl),%xmm11
	movaps	`$framesz+16*6`($Tbl),%xmm12
	movaps	`$framesz+16*7`($Tbl),%xmm13
	movaps	`$framesz+16*8`($Tbl),%xmm14
	movaps	`$framesz+16*9`($Tbl),%xmm15
.cfi_def_cfa_register	%rsp
.size	${func}_avx2,.-${func}_avx2
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my ($rounds,$Tbl)=("%r11d","%rbx");

my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");
my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);

	movups		`16*$n`($in0),$in		# load input
$code.=<<___ if ($n);
	movups		$iv,`16*($n-1)`($out,$in0)	# write output
	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv

	movups		`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv

	movups		`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv
	movups		`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc		$rndkey[1],$iv

	aesenclast	$rndkey[0],$iv
	movups		16-112($key),$rndkey[1]		# forward reference

	movups		`32+16*$k-112`($key),$rndkey[1]
	aesenc		$rndkey[0],$iv

  $r++;	unshift(@rndkey,pop(@rndkey));
};
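# A sketch of the scheme: $r counts AES steps globally across the whole
# SHA256 block, so $n=$r/10 selects the 16-byte block being CBC-encrypted
# and $k=$r%10 the round step within it; @rndkey is rotated on every step
# so the next round key is always being fetched while the current one is
# in use, and the elided branches around the aesenclast supply the extra
# rounds for 192/256-bit keys. Calling &$aesenc() at fixed points of the
# SHA flow spreads one AES block evenly over the SHA rounds.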
.type	${func}_shaext,\@function,6
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
	lea		K256+0x80(%rip),$Tbl
	movdqu		($ctx),$ABEF		# DCBA
	movdqu		16($ctx),$CDGH		# HGFE
	movdqa		0x200-0x80($Tbl),$TMP	# byte swap mask

	mov		240($key),$rounds
	movups		($key),$rndkey0		# $key[0]
	movups		($ivp),$iv		# load IV
	movups		16($key),$rndkey[0]	# forward reference
	lea		112($key),$key		# size optimization

	pshufd		\$0x1b,$ABEF,$Wi	# ABCD
	pshufd		\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd		\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa		$TMP,$BSWAP		# offload
	palignr		\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH		# CDGH
	movdqu		($inp),@MSG[0]
	movdqu		0x10($inp),@MSG[1]
	movdqu		0x20($inp),@MSG[2]
	movdqu		0x30($inp),@MSG[3]

	movdqa		0*32-0x80($Tbl),$Wi
	movdqa		$CDGH,$CDGH_SAVE	# offload
	movdqa		$ABEF,$ABEF_SAVE	# offload
	sha256rnds2	$ABEF,$CDGH		# 0-3
	pshufd		\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa		1*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH		# 4-7
	pshufd		\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa		2*32-0x80($Tbl),$Wi
	sha256msg1	@MSG[1],@MSG[0]
	sha256rnds2	$ABEF,$CDGH		# 8-11
	pshufd		\$0x0e,$Wi,$Wi
	palignr		\$4,@MSG[2],$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa		3*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
	sha256rnds2	$ABEF,$CDGH		# 12-15
	pshufd		\$0x0e,$Wi,$Wi
	palignr		\$4,@MSG[3],$TMP
	sha256rnds2	$CDGH,$ABEF
for($i=4;$i<16-3;$i++) {
	&$aesenc()	if (($r%10)==0);
	movdqa		$i*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$ABEF,$CDGH		# 16-19...
	pshufd		\$0x0e,$Wi,$Wi
	palignr		\$4,@MSG[0],$TMP
	&$aesenc()	if ($r==19);
	sha256rnds2	$CDGH,$ABEF

	push(@MSG,shift(@MSG));
}
	movdqa		13*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$ABEF,$CDGH		# 52-55
	pshufd		\$0x0e,$Wi,$Wi
	palignr		\$4,@MSG[0],$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa		14*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[1],@MSG[2]
	sha256rnds2	$ABEF,$CDGH		# 56-59
	pshufd		\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa		15*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH		# 60-63
	pshufd		\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	#pxor		$CDGH,$rndkey0		# black magic
while ($r<40)	{ &$aesenc(); }			# remaining aesenc's
	#xorps		$CDGH,$rndkey0		# black magic
	paddd		$CDGH_SAVE,$CDGH
	paddd		$ABEF_SAVE,$ABEF

	movups		$iv,48($out,$in0)	# write output
	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF		# DCBA
	palignr		\$8,$TMP,$CDGH		# HGFE

	movups		$iv,($ivp)		# write IV
	movdqu		$CDGH,16($ctx)
$code.=<<___ if ($win64);
	movaps	0*16(%rsp),%xmm6
	movaps	1*16(%rsp),%xmm7
	movaps	2*16(%rsp),%xmm8
	movaps	3*16(%rsp),%xmm9
	movaps	4*16(%rsp),%xmm10
	movaps	5*16(%rsp),%xmm11
	movaps	6*16(%rsp),%xmm12
	movaps	7*16(%rsp),%xmm13
	movaps	8*16(%rsp),%xmm14
	movaps	9*16(%rsp),%xmm15
	lea	8+10*16(%rsp),%rsp
.size	${func}_shaext,.-${func}_shaext
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
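#
# se_handler works in three steps: if context->Rip lies between the
# prologue and epilogue labels the frame is live, so the original %rsp
# is recovered (read back from the $_rsp slot, or computed directly for
# the shaext flavour); the non-volatile GPRs and the Xmm6-Xmm15 save
# area are then copied back into *context; finally control is handed to
# RtlVirtualUnwind so the system can continue unwinding.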
if ($win64 && $avx) {
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip
	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	add	\$`2*$SZ*($rounds-8)`,%rax

	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp

	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6-Xmm15 save area
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq

	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi
	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)/8, rep movsq copies qwords
	.long	0xa548f3fc		# cld; rep movsq

	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
.size	se_handler,.-se_handler
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop

	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext

.LSEH_info_${func}_xop:
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]

.LSEH_info_${func}_avx:
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}

{
  my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1" => 0xcc,
		"sha256msg2" => 0xcd	);

  sub sha256op38 {
    my $instr = shift;

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
  }
}
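# Worked example (a sketch): with $ABEF=%xmm1 and $CDGH=%xmm2, the line
# "sha256rnds2 %xmm1,%xmm2" is rewritten as ".byte 0x0f,0x38,0xcb,0xd1"
# (no REX prefix since both registers are below %xmm8; ModR/M 0xd1
# encodes reg=2, r/m=1), so the module assembles even with tools that
# predate the SHA extensions.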
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
print $code;

close STDOUT or die "error closing STDOUT: $!";