# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encrypt exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm allows better
# utilization of processor resources and hence better performance.
# SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
# AESNI code is woven into it. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte, less is better,
# for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
# subroutines:
#		 	AES-128/-192/-256+SHA256	this(**)	gain
# Sandy Bridge	 	5.05/6.05/7.05+11.6		13.0		+28%/36%/43%
# Ivy Bridge	 	5.05/6.05/7.05+10.3		11.6		+32%/41%/50%
# Haswell	 	4.43/5.29/6.19+7.80		8.79		+39%/49%/59%
# Bulldozer	 	5.77/6.89/8.00+13.7		13.7		+42%/50%/58%
# (*)	there are XOP, AVX1 and AVX2 code paths; Westmere is omitted
#	from the loop, because the gain was not estimated high enough
#	to justify the effort;
# (**)	these are EVP-free results; results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
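#
# The interleave granularity is one AES-CBC clause per SHA256 round:
# one 16-byte CBC block completes every 16 rounds, so each 64-byte
# SHA256 block hides four AES blocks. Roughly (a conceptual sketch,
# not the emitted code):
#
#	for each 64-byte chunk {
#	    for ($i=0; $i<64; $i++) {
#		SHA256_round($i);		# handful of scalar/SIMD ops
#		AES_CBC_clause($i%16);		# 1-2 AES-NI instructions
#	    }
#	}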
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}
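# $avx captures what the assembler can cope with: 0 suppresses all
# AVX code, 1 enables the XOP and AVX1 paths, 2 additionally enables
# the AVX2 path.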
open OUT,"| \"$^X\" $xlate $flavour $output";

$func="aesni_cbc_sha256_enc";

@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA256_CTX *ctx,
#			const void *in0);
($inp, $out, $len, $key, $ivp, $ctx, $in0) =
("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="16*$SZ+7*8(%rsp)";
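#
# Stack frame layout for the XOP/AVX1 paths, per the definitions
# above: 16*$SZ bytes of X[]+K[] working copies, then eight 8-byte
# save slots, with the Win64 xmm6-15 save area following:
#
#	(%rsp)			X[0..15]+K[0..15] copies (16*$SZ bytes)
#	16*$SZ+0*8..6*8(%rsp)	saved $inp,$out,$end,$key,$ivp,$ctx,$in0
#	16*$SZ+7*8(%rsp)	original %rsp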
.extern	OPENSSL_ia32cap_P
.type	$func,\@abi-omnipotent
$code.=<<___ if ($avx);
	lea	OPENSSL_ia32cap_P(%rip),%r11
	cmp	\$0,`$win64?"%rcx":"%rdi"`
	test	\$`1<<11`,%r10d			# check for XOP
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
$code.=<<___ if ($avx);
	and	\$`1<<30`,%eax			# mask "Intel CPU" bit
	and	\$`1<<28|1<<9`,%r10d		# mask AVX+SSSE3 bits
	cmp	\$`1<<28|1<<9|1<<30`,%r10d
	cmp	\$0,`$win64?"%rcx":"%rdi"`
.type	$TABLE,\@object
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
	.long	0,0,0,0, 0,0,0,0, -1,-1,-1,-1
	.long	0,0,0,0, 0,0,0,0
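# Each K256 row above is given twice so that the AVX2 path can vpaddd
# a 256-bit register with identical constants in both 128-bit lanes;
# the 0x00010203... rows are the big-endian byte-swap mask. The
# trailing 0/-1 rows form a sliding window of all-zero/all-ones
# 16-byte masks: loaded as $mask14/$mask12/$mask10 according to the
# AES round count, they select which vaesenclast result is the real
# last-round output, so one stitched path serves all key lengths.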
.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
######################################################################
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
##	&vmovdqu	($inout,($inp));
##	&mov		($_inp,$inp);
	'&vpxor		($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',
	'&vpxor		($inout,$inout,$iv);',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',
	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',
	'&vpand		($iv,$temp,$mask10);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',
	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',
	'&vpand		($temp,$temp,$mask12);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xe0-0x80($inp)");',
	'&vpor		($iv,$iv,$temp);'.
	' &vaesenclast	($temp,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'
##	&mov		($inp,$_inp);
##	&mov		($out,$_out);
##	&vpand		($temp,$temp,$mask14);
##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	("($out,$inp)",$iv);
##	&lea		($inp,"16($inp)");
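# The sixteen clauses above are dispatched one per SHA256 round via
# $aesni_cbc_idx, so a complete 10/12/14-round CBC encryption of one
# 16-byte block is spread over each group of sixteen rounds.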
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
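# E.g. &ror($a0,14) emits "ror \$14,$a0": the last Perl argument
# comes out as the first, AT&T-style source, operand and a bare
# number gets the '$' immediate prefix.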
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g
	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&and	($a4,$e)',			# (f^g)&e
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&add	($d,$h)',			# d+=h
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($h,$a3)',			# h+=Maj(a,b,c)
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
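# One body_00_15 invocation is a single SHA256 round; in formula form
# it computes
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	d += T1, h = T1 + Sigma0(a) + Maj(a,b,c)
#
# with Ch(e,f,g)=((f^g)&e)^g and Maj(a,b,c)=((b^c)&(a^b))^b, while
# the @ROT rotation at the end renames registers instead of moving
# eight values around.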
######################################################################
.type	${func}_xop,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp			# align stack frame
	sub	$inp,$out			# re-bias
	add	$inp,$len			# end of input
	#mov	$inp,$_inp			# saved later
	#mov	$key,$_key			# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r12			# borrow $a4
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV
	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl		# size optimization
	vmovdqu	(%r12),$inout			# $a4
sub XOP_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	&vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	&vpsrld		($t0,$t0,$sigma0[2]);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
	&vpxor		($t0,$t0,$t1);
	&vprotd		($t3,@X[3],8*$SZ-$sigma1[1]);
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	&vpsrld		($t2,@X[3],$sigma1[2]);
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	&vpsrldq	($t3,$t3,8);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	&vprotd		($t3,@X[0],8*$SZ-$sigma1[1]);
	&vpsrld		($t2,@X[0],$sigma1[2]);
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	&vpslldq	($t3,$t3,8);		# 22 instructions
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
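# The vector steps above implement the SHA256 message schedule
#
#	X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
#
# four elements at a time, with sigma0(x) = ROTR^7(x)^ROTR^18(x)^SHR^3(x)
# and sigma1(x) = ROTR^17(x)^ROTR^19(x)^SHR^10(x); the final vpaddd
# folds K[i] in, so each (%rsp) slot already holds X[i]+K[i].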
    for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov	("%r12",$_inp);			# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);			# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea	("%r12","16(%r12)");		# inp++
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lxop_00_47");
	&vmovdqu	($inout,"(%r12)");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }

	mov	$_inp,%r12			# borrow $a4
	mov	$_out,%r13			# borrow $a0
	mov	$_ctx,%r15			# borrow $a2
	mov	$_in0,%rsi			# borrow $a3
	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)			# write output
	lea	16(%r12),%r12			# inp++
	vmovdqu	$iv,($ivp)			# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.size	${func}_xop,.-${func}_xop
######################################################################
local *ror = sub { &shrd(@_[0],@_) };

.type	${func}_avx,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp			# align stack frame
	sub	$inp,$out			# re-bias
	add	$inp,$len			# end of input
	#mov	$inp,$_inp			# saved later
	#mov	$key,$_key			# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r12			# borrow $a4
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV
	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl		# size optimization
	vmovdqu	(%r12),$inout			# $a4
sub Xupdate_256_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',		# sigma1(X[14..15])
	'&vpshufd	($t2,$t2,0b10000100)',
	'&vpsrldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',		# sigma1(X[16..17])
	'&vpshufd	($t2,$t2,0b11101000)',
	'&vpslldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
	);
}
sub AVX_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov	("%r12",$_inp);			# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);			# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r12)",$iv);	# write output
	&lea	("%r12","16(%r12)");		# inp++
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");
	&vmovdqu	($inout,"(%r12)");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }

	mov	$_inp,%r12			# borrow $a4
	mov	$_out,%r13			# borrow $a0
	mov	$_ctx,%r15			# borrow $a2
	mov	$_in0,%rsi			# borrow $a3
	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)			# write output
	lea	16(%r12),%r12			# inp++
	vmovdqu	$iv,($ivp)			# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.size	${func}_avx,.-${func}_avx
######################################################################
my $a5=$SZ==4?"%esi":"%rsi";	# zap $inp

# at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',			# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',
	'&lea	($a,"($a,$a1)")',		# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',			# ~e&g
	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',		# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',			# Sigma1(e)
	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',		# h+=Sigma1(e)
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',
	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',		# d+=h
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',			# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.		# h+=Maj(a,b,c)
	'&mov	($a4,$e)',			# copy of f in future
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
# and at the finish one still has to do $a+=$a1
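#
# Deferring Sigma0(a) like this takes it off the critical path: every
# round completes its predecessor's h+=Sigma0(a) with the lea at its
# start, and the flag-less BMI2 rorx lets the three rotations of one
# Sigma issue back to back.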
.type	${func}_avx2,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
	and	\$-256*$SZ,%rsp			# align stack frame
	add	\$`2*$SZ*($rounds-8)`,%rsp
	sub	$inp,$out			# re-bias
	add	$inp,$len			# end of input
	#mov	$inp,$_inp			# saved later
	#mov	$out,$_out			# kept in $offload
	#mov	$key,$_key			# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r13			# borrow $a0
	vpinsrq	\$1,$out,$offload,$offload
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV
	vmovdqa	0x00(%r12,%r14,8),$mask14
	vmovdqa	0x10(%r12,%r14,8),$mask12
	vmovdqa	0x20(%r12,%r14,8),$mask10
	sub	\$-16*$SZ,%r13			# inp++, size optimization
	lea	(%rsi,%r13),%r12		# borrow $a0
	cmp	$len,%r13			# $_end
	cmove	%rsp,%r12			# next block or random data
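# The AVX2 path digests two 64-byte blocks at once, one per 128-bit
# lane of each %ymm register; past the end of input %r12 is redirected
# to the stack, so the spare lane chews on stale bytes whose result is
# simply discarded.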
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));

	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	lea	-16*$SZ(%r13),%r13
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vmovdqa	$t2,0x00(%rsp)
	vmovdqa	$t3,0x20(%rsp)
	sub	\$-16*2*$SZ,$Tbl		# size optimization
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload
sub AVX2_256_00_47 () {
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&vmovq	("%r13",$offload);		# borrow $a0
	&vpextrq	("%r15",$offload,1);	# borrow $a2
	&vpand	($temp,$temp,$mask14);
	&vpor	($iv,$iv,$temp);
	&vmovdqu	("(%r15,%r13)",$iv);	# write output
	&lea	("%r13","16(%r13)");		# inp++
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");
	&vmovdqu	($inout,"(%r13)");
	&vpinsrq	($offload,$offload,"%r13",0);
    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }

	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
	vmovq	$offload,%r13			# $_inp, borrow $a0
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r12,%r13)			# write output
	cmp	`$PUSH8+2*8`($Tbl),%r13		# $_end
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload

    for ($i=0; $i<16; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
    }

	vmovq	$offload,%r13			# borrow $a0
	vpextrq	\$1,$offload,%r15		# borrow $a2
	vpand	$mask14,$temp,$temp
	lea	-$PUSH8($Tbl),$Tbl
	vmovdqu	$iv,(%r15,%r13)			# write output
	lea	16(%r13),%r13			# inp++
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	16*$SZ(%r13),%r13
	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
	lea	(%rsi,%r13),%r12
	cmove	%rsp,%r12			# next block or stale data
	vmovdqu	$iv,($ivp)			# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.size	${func}_avx2,.-${func}_avx2
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
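#
# se_handler decides whether the faulting RIP lies between the
# prologue and epilogue labels recorded in HandlerData; if so it pulls
# the saved %rsp from $_rsp, restores the non-volatile GP registers
# and the xmm6-15 save area into *context, then lets RtlVirtualUnwind
# do the actual unwinding.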
$code.=<<___ if ($avx);
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
	mov	120($context),%rax		# pull context->Rax
	mov	248($context),%rbx		# pull context->Rip
	mov	8($disp),%rsi			# disp->ImageBase
	mov	56($disp),%r11			# disp->HandlerData
	mov	0(%r11),%r10d			# HandlerData[0]
	lea	(%rsi,%r10),%r10		# prologue label
	cmp	%r10,%rbx			# context->Rip<prologue label
	mov	152($context),%rax		# pull context->Rsp
	mov	4(%r11),%r10d			# HandlerData[1]
	lea	(%rsi,%r10),%r10		# epilogue label
	cmp	%r10,%rbx			# context->Rip>=epilogue label
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx			# context->Rip<avx2_shortcut
	add	\$`2*$SZ*($rounds-8)`,%rax
	mov	%rax,%rsi			# put aside Rsp
	mov	16*$SZ+7*8(%rax),%rax		# pull $_rsp
	mov	%rbx,144($context)		# restore context->Rbx
	mov	%rbp,160($context)		# restore context->Rbp
	mov	%r12,216($context)		# restore context->R12
	mov	%r13,224($context)		# restore context->R13
	mov	%r14,232($context)		# restore context->R14
	mov	%r15,240($context)		# restore context->R15
	lea	.Lepilogue(%rip),%r10
	jb	.Lin_prologue			# non-AVX code
	lea	16*$SZ+8*8(%rsi),%rsi		# Xmm6-15 save area
	lea	512($context),%rdi		# &context.Xmm6
	.long	0xa548f3fc			# cld; rep movsq
	mov	%rax,152($context)		# restore context->Rsp
	mov	%rsi,168($context)		# restore context->Rsi
	mov	%rdi,176($context)		# restore context->Rdi
	mov	40($disp),%rdi			# disp->ContextRecord
	mov	$context,%rsi			# context
	mov	\$154,%ecx			# sizeof(CONTEXT)/8, in qwords
	.long	0xa548f3fc			# cld; rep movsq
	xor	%rcx,%rcx			# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx			# arg2, disp->ImageBase
	mov	0(%rsi),%r8			# arg3, disp->ControlPc
	mov	16(%rsi),%r9			# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10			# disp->ContextRecord
	lea	56(%rsi),%r11			# &disp->HandlerData
	lea	24(%rsi),%r12			# &disp->EstablisherFrame
	mov	%r10,32(%rsp)			# arg5
	mov	%r11,40(%rsp)			# arg6
	mov	%r12,48(%rsp)			# arg7
	mov	%rcx,56(%rsp)			# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)
	mov	\$1,%eax			# ExceptionContinueSearch
.size	se_handler,.-se_handler
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
$code.=<<___ if ($avx);
.LSEH_info_${func}_xop:
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
.LSEH_info_${func}_avx:
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]

$code =~ s/\`([^\`]*)\`/eval $1/gem;