2 # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # Multi-buffer AES-NI procedures process several independent buffers
18 # in parallel by interleaving independent instructions.
20 # Cycles per byte for interleave factor 4:
23 # ---------------------------
24 # Westmere 5.00/4=1.25 5.13/4=1.28
25 # Atom 15.0/4=3.75 ?15.7/4=3.93
26 # Sandy Bridge 5.06/4=1.27 5.18/4=1.29
27 # Ivy Bridge 5.06/4=1.27 5.14/4=1.29
28 # Haswell 4.44/4=1.11 4.44/4=1.11
29 # Bulldozer 5.75/4=1.44 5.76/4=1.44
31 # Cycles per byte for interleave factor 8 (not implemented for
32 # pre-AVX processors, where higher interleave factor incidentally
33 # doesn't result in improvement):
36 # ---------------------------
37 # Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*)
38 # Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*)
39 # Haswell 5.00/8=0.63 5.00/8=0.63
40 # Bulldozer 5.75/8=0.72 5.77/8=0.72
42 # (*) Sandy/Ivy Bridge are known to handle high interleave factors
45 # $output is the last argument if it looks like a file (it has an extension)
46 # $flavour is the first argument if it doesn't look like a file
# Command-line handling: an optional leading "flavour" word and an optional
# trailing output file name (see the two comment lines above).
47 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
48 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# $win64 gates the Windows-only pieces emitted under `if ($win64)` below
# (xmm register save/restore). It is set for nasm/masm/mingw64 flavours or
# when the output file name ends in ".asm".
50 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# $dir becomes the directory part of this script's own path ($0), including
# the trailing separator; either '/' or '\' is accepted as a separator.
52 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# Locate the x86_64-xlate.pl translator: first next to this script, then in
# ../../perlasm relative to it. Abort if neither file exists.
53 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
54 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
55 die "can't locate x86_64-xlate.pl";
# Make the same two directories searchable and load the support library.
57 push(@INC,"${dir}","${dir}../../perlasm");
58 require "x86_64-support.pl";
# Pointer size in bytes for the selected flavour; it feeds the descriptor
# layout arithmetic ($inp_elm_size and the per-field displacements).
# NOTE(review): pointer_size() is not defined in the visible part of this
# file - presumably it comes from x86_64-support.pl; confirm.
60 $ptr_size=&pointer_size($flavour);
# Assembler capability probing. Each probe leaves $avx at 0, 1 or 2 (sum of
# two version comparisons); any non-zero value enables the code guarded by
# `if ($avx)` further down. Later probes run only while $avx is still false.
# NOTE(review): versions are compared as decimal numbers, not as
# major.minor pairs.
#
# Probe 1: GNU assembler, queried through the C compiler named in $ENV{CC}.
64 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
65 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
66 $avx = ($1>=2.19) + ($1>=2.22);
# Probe 2: NASM, only for Win64 builds that select nasm via flavour or $ENV{ASM}.
69 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
70 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
71 $avx = ($1>=2.09) + ($1>=2.10);
# Probe 3: Microsoft ml64, only for Win64 builds that select masm/ml64.
74 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
75 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
76 $avx = ($1>=10) + ($1>=11);
# Probe 4: clang / LLVM-based compilers, version taken from `$ENV{CC} -v`.
79 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
80 $avx = ($2>=3.0) + ($2>3.0);
# Open a pipe to the translator, run with the same perl binary ($^X) and
# given the flavour and output file name; failure to start it is fatal.
83 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
84 or die "can't call $xlate: $!";
87 # void aesni_multi_cbc_encrypt (
88 # struct { void *inp,*out; int blocks; double iv[2]; } inp[8];
90 # int num); /* 1 or 2 */
# Register and layout assignments for the 4-way (non-AVX) code path.
92 $inp="%rdi"; # 1st arg
93 $key="%rsi"; # 2nd arg
# Size in bytes of one descriptor of the inp[] array documented above: two
# pointers, an 8-byte slot holding the block count, and the 16-byte IV.
# The field displacements used later follow the same layout
# (0, $ptr_size, 2*$ptr_size and 2*$ptr_size+8).
96 $inp_elm_size=2*$ptr_size+8+16;
# One input pointer and one output pointer per stream.
98 @inptr=map("%r$_",(8..11));
99 @outptr=map("%r$_",(12..15));
# Two round-key registers that are loaded alternately.
101 ($rndkey0,$rndkey1)=("%xmm0","%xmm1");
# @out holds the running cipher state of each stream, @inp the blocks
# loaded from the input pointers.
102 @out=map("%xmm$_",(2..5));
103 @inp=map("%xmm$_",(6..9));
# $counters: the four 32-bit per-stream block counters kept at 32(%rsp);
# $mask: value added to $counters when they are decremented;
# $zero: loaded with the 0-round key.
104 ($counters,$mask,$zero)=map("%xmm$_",(10..12));
# $rounds: value read from offset 0xf0 of the key structure;
# $one: constant 1 inside the loop, borrowed earlier for the block count;
# $sink: pointer that cancelled inputs/outputs are redirected to;
# $offset: index register used when addressing the input/output streams.
106 ($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx");
111 .extern OPENSSL_ia32cap_P
113 .globl aesni_multi_cbc_encrypt
114 .type aesni_multi_cbc_encrypt,\@function,3
116 aesni_multi_cbc_encrypt:
119 $code.=<<___ if ($avx);
122 mov OPENSSL_ia32cap_P+4(%rip),%ecx
123 test \$`1<<28`,%ecx # AVX bit
124 jnz _avx_cbc_enc_shortcut
131 .cfi_def_cfa_register %rax
145 $code.=<<___ if ($win64);
148 movaps %xmm7,0x10(%rsp)
149 movaps %xmm8,0x20(%rsp)
150 movaps %xmm9,0x30(%rsp)
151 movaps %xmm10,0x40(%rsp)
152 movaps %xmm11,0x50(%rsp)
153 movaps %xmm12,0x60(%rsp)
154 movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
155 movaps %xmm14,-0x58(%rax)
156 movaps %xmm15,-0x48(%rax)
162 # +16 input sink [original %rsp and $num]
167 mov %rax,16(%rsp) # original %rsp
168 .cfi_cfa_expression %rsp+16,deref,+8
171 movdqu ($key),$zero # 0-round key
172 lea 0x78($key),$key # size optimization
173 lea $inp_elm_size*2($inp),$inp
176 mov $num,24(%rsp) # original $num
179 for($i=0;$i<4;$i++) {
180 $inptr_reg=&pointer_register($flavour,@inptr[$i]);
181 $outptr_reg=&pointer_register($flavour,@outptr[$i]);
183 # borrow $one for number of blocks
184 mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one
185 mov `$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg
187 mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg
188 cmovg $one,$num # find maximum
191 movdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@out[$i]
192 mov $one,`32+4*$i`(%rsp) # initialize counters
193 cmovle %rsp,@inptr[$i] # cancel input
200 movups 0x10-0x78($key),$rndkey1
202 movups 0x20-0x78($key),$rndkey0
204 mov 0xf0-0x78($key),$rounds
206 movdqu (@inptr[0]),@inp[0] # load inputs
208 movdqu (@inptr[1]),@inp[1]
210 movdqu (@inptr[2]),@inp[2]
212 movdqu (@inptr[3]),@inp[3]
215 movdqa 32(%rsp),$counters # load counters
222 lea 16(%rsp),$sink # sink pointer
223 mov \$1,$one # constant of 1
226 aesenc $rndkey1,@out[0]
227 prefetcht0 31(@inptr[0],$offset) # prefetch input, one hint per stream
228 prefetcht0 31(@inptr[1],$offset)
229 aesenc $rndkey1,@out[1]
230 prefetcht0 31(@inptr[2],$offset)
231 prefetcht0 31(@inptr[3],$offset) # stream 3 (lane 2 is covered on the line above)
232 aesenc $rndkey1,@out[2]
233 aesenc $rndkey1,@out[3]
234 movups 0x30-0x78($key),$rndkey1 # next round key, schedule offset 0x30
236 for($i=0;$i<4;$i++) {
237 my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
239 cmp `32+4*$i`(%rsp),$one
240 aesenc $rndkey,@out[0]
241 aesenc $rndkey,@out[1]
242 aesenc $rndkey,@out[2]
243 cmovge $sink,@inptr[$i] # cancel input
244 cmovg $sink,@outptr[$i] # sink output
245 aesenc $rndkey,@out[3]
246 movups `0x40+16*$i-0x78`($key),$rndkey
250 movdqa $counters,$mask
251 aesenc $rndkey0,@out[0]
252 prefetcht0 15(@outptr[0],$offset) # prefetch output
253 prefetcht0 15(@outptr[1],$offset)
254 aesenc $rndkey0,@out[1]
255 prefetcht0 15(@outptr[2],$offset)
256 prefetcht0 15(@outptr[3],$offset)
257 aesenc $rndkey0,@out[2]
258 aesenc $rndkey0,@out[3]
259 movups 0x80-0x78($key),$rndkey0
262 aesenc $rndkey1,@out[0]
264 movdqu -0x78($key),$zero # reload 0-round key
265 aesenc $rndkey1,@out[1]
266 paddd $mask,$counters # decrement counters
267 movdqa $counters,32(%rsp) # update counters
268 aesenc $rndkey1,@out[2]
269 aesenc $rndkey1,@out[3]
270 movups 0x90-0x78($key),$rndkey1
274 aesenc $rndkey0,@out[0]
275 aesenc $rndkey0,@out[1]
276 aesenc $rndkey0,@out[2]
277 aesenc $rndkey0,@out[3]
278 movups 0xa0-0x78($key),$rndkey0
282 aesenc $rndkey1,@out[0]
283 aesenc $rndkey1,@out[1]
284 aesenc $rndkey1,@out[2]
285 aesenc $rndkey1,@out[3]
286 movups 0xb0-0x78($key),$rndkey1
288 aesenc $rndkey0,@out[0]
289 aesenc $rndkey0,@out[1]
290 aesenc $rndkey0,@out[2]
291 aesenc $rndkey0,@out[3]
292 movups 0xc0-0x78($key),$rndkey0
296 aesenc $rndkey1,@out[0]
297 aesenc $rndkey1,@out[1]
298 aesenc $rndkey1,@out[2]
299 aesenc $rndkey1,@out[3]
300 movups 0xd0-0x78($key),$rndkey1
302 aesenc $rndkey0,@out[0]
303 aesenc $rndkey0,@out[1]
304 aesenc $rndkey0,@out[2]
305 aesenc $rndkey0,@out[3]
306 movups 0xe0-0x78($key),$rndkey0
311 aesenc $rndkey1,@out[0]
312 aesenc $rndkey1,@out[1]
313 aesenc $rndkey1,@out[2]
314 aesenc $rndkey1,@out[3]
315 movdqu (@inptr[0],$offset),@inp[0]
316 movdqu 0x10-0x78($key),$rndkey1
318 aesenclast $rndkey0,@out[0]
319 movdqu (@inptr[1],$offset),@inp[1]
321 aesenclast $rndkey0,@out[1]
322 movdqu (@inptr[2],$offset),@inp[2]
324 aesenclast $rndkey0,@out[2]
325 movdqu (@inptr[3],$offset),@inp[3]
327 aesenclast $rndkey0,@out[3]
328 movdqu 0x20-0x78($key),$rndkey0
331 movups @out[0],-16(@outptr[0],$offset)
333 movups @out[1],-16(@outptr[1],$offset)
335 movups @out[2],-16(@outptr[2],$offset)
337 movups @out[3],-16(@outptr[3],$offset)
343 mov 16(%rsp),%rax # original %rsp
347 #pxor @inp[0],@out[0]
348 #pxor @inp[1],@out[1]
350 #movdqu @out[0],`$inp_elm_size*0+2*$ptr_size+8-$inp_elm_size*2`($inp)
351 #pxor @inp[2],@out[2]
352 #movdqu @out[1],`$inp_elm_size*1+2*$ptr_size+8-$inp_elm_size*2`($inp)
353 #pxor @inp[3],@out[3]
354 #movdqu @out[2],`$inp_elm_size*2+2*$ptr_size+8-$inp_elm_size*2`($inp) # won't fix, let caller
355 #movdqu @out[3],`$inp_elm_size*3+2*$ptr_size+8-$inp_elm_size*2`($inp) # figure this out...
357 lea `$inp_elm_size*4`($inp),$inp
359 jnz .Lenc4x_loop_grande
363 $code.=<<___ if ($win64);
364 movaps -0xd8(%rax),%xmm6
365 movaps -0xc8(%rax),%xmm7
366 movaps -0xb8(%rax),%xmm8
367 movaps -0xa8(%rax),%xmm9
368 movaps -0x98(%rax),%xmm10
369 movaps -0x88(%rax),%xmm11
370 movaps -0x78(%rax),%xmm12
371 #movaps -0x68(%rax),%xmm13
372 #movaps -0x58(%rax),%xmm14
373 #movaps -0x48(%rax),%xmm15
389 .cfi_def_cfa_register %rsp
393 .size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
395 .globl aesni_multi_cbc_decrypt
396 .type aesni_multi_cbc_decrypt,\@function,3
398 aesni_multi_cbc_decrypt:
401 $code.=<<___ if ($avx);
404 mov OPENSSL_ia32cap_P+4(%rip),%ecx
405 test \$`1<<28`,%ecx # AVX bit
406 jnz _avx_cbc_dec_shortcut
413 .cfi_def_cfa_register %rax
427 $code.=<<___ if ($win64);
430 movaps %xmm7,0x10(%rsp)
431 movaps %xmm8,0x20(%rsp)
432 movaps %xmm9,0x30(%rsp)
433 movaps %xmm10,0x40(%rsp)
434 movaps %xmm11,0x50(%rsp)
435 movaps %xmm12,0x60(%rsp)
436 movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
437 movaps %xmm14,-0x58(%rax)
438 movaps %xmm15,-0x48(%rax)
444 # +16 input sink [original %rsp and $num]
449 mov %rax,16(%rsp) # original %rsp
450 .cfi_cfa_expression %rsp+16,deref,+8
453 movdqu ($key),$zero # 0-round key
454 lea 0x78($key),$key # size optimization
455 lea $inp_elm_size*2($inp),$inp
458 mov $num,24(%rsp) # original $num
461 for($i=0;$i<4;$i++) {
462 $inptr_reg=&pointer_register($flavour,@inptr[$i]);
463 $outptr_reg=&pointer_register($flavour,@outptr[$i]);
465 # borrow $one for number of blocks
466 mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one
467 mov `$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg
469 mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg
470 cmovg $one,$num # find maximum
473 movdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@inp[$i]
474 mov $one,`32+4*$i`(%rsp) # initialize counters
475 cmovle %rsp,@inptr[$i] # cancel input
482 movups 0x10-0x78($key),$rndkey1
483 movups 0x20-0x78($key),$rndkey0
484 mov 0xf0-0x78($key),$rounds
485 movdqu (@inptr[0]),@out[0] # load inputs
486 movdqu (@inptr[1]),@out[1]
488 movdqu (@inptr[2]),@out[2]
490 movdqu (@inptr[3]),@out[3]
493 movdqa 32(%rsp),$counters # load counters
500 lea 16(%rsp),$sink # sink pointer
501 mov \$1,$one # constant of 1
504 aesdec $rndkey1,@out[0]
505 prefetcht0 31(@inptr[0],$offset) # prefetch input
506 prefetcht0 31(@inptr[1],$offset)
507 aesdec $rndkey1,@out[1]
508 prefetcht0 31(@inptr[2],$offset)
509 prefetcht0 31(@inptr[3],$offset)
510 aesdec $rndkey1,@out[2]
511 aesdec $rndkey1,@out[3]
512 movups 0x30-0x78($key),$rndkey1
514 for($i=0;$i<4;$i++) {
515 my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
517 cmp `32+4*$i`(%rsp),$one
518 aesdec $rndkey,@out[0]
519 aesdec $rndkey,@out[1]
520 aesdec $rndkey,@out[2]
521 cmovge $sink,@inptr[$i] # cancel input
522 cmovg $sink,@outptr[$i] # sink output
523 aesdec $rndkey,@out[3]
524 movups `0x40+16*$i-0x78`($key),$rndkey
528 movdqa $counters,$mask
529 aesdec $rndkey0,@out[0]
530 prefetcht0 15(@outptr[0],$offset) # prefetch output
531 prefetcht0 15(@outptr[1],$offset)
532 aesdec $rndkey0,@out[1]
533 prefetcht0 15(@outptr[2],$offset)
534 prefetcht0 15(@outptr[3],$offset)
535 aesdec $rndkey0,@out[2]
536 aesdec $rndkey0,@out[3]
537 movups 0x80-0x78($key),$rndkey0
540 aesdec $rndkey1,@out[0]
542 movdqu -0x78($key),$zero # reload 0-round key
543 aesdec $rndkey1,@out[1]
544 paddd $mask,$counters # decrement counters
545 movdqa $counters,32(%rsp) # update counters
546 aesdec $rndkey1,@out[2]
547 aesdec $rndkey1,@out[3]
548 movups 0x90-0x78($key),$rndkey1
552 aesdec $rndkey0,@out[0]
553 aesdec $rndkey0,@out[1]
554 aesdec $rndkey0,@out[2]
555 aesdec $rndkey0,@out[3]
556 movups 0xa0-0x78($key),$rndkey0
560 aesdec $rndkey1,@out[0]
561 aesdec $rndkey1,@out[1]
562 aesdec $rndkey1,@out[2]
563 aesdec $rndkey1,@out[3]
564 movups 0xb0-0x78($key),$rndkey1
566 aesdec $rndkey0,@out[0]
567 aesdec $rndkey0,@out[1]
568 aesdec $rndkey0,@out[2]
569 aesdec $rndkey0,@out[3]
570 movups 0xc0-0x78($key),$rndkey0
574 aesdec $rndkey1,@out[0]
575 aesdec $rndkey1,@out[1]
576 aesdec $rndkey1,@out[2]
577 aesdec $rndkey1,@out[3]
578 movups 0xd0-0x78($key),$rndkey1
580 aesdec $rndkey0,@out[0]
581 aesdec $rndkey0,@out[1]
582 aesdec $rndkey0,@out[2]
583 aesdec $rndkey0,@out[3]
584 movups 0xe0-0x78($key),$rndkey0
589 aesdec $rndkey1,@out[0]
590 aesdec $rndkey1,@out[1]
591 aesdec $rndkey1,@out[2]
592 pxor $rndkey0,@inp[0]
593 pxor $rndkey0,@inp[1]
594 aesdec $rndkey1,@out[3]
595 movdqu 0x10-0x78($key),$rndkey1
596 pxor $rndkey0,@inp[2]
597 pxor $rndkey0,@inp[3]
598 movdqu 0x20-0x78($key),$rndkey0
600 aesdeclast @inp[0],@out[0]
601 aesdeclast @inp[1],@out[1]
602 movdqu -16(@inptr[0],$offset),@inp[0] # load next IV
603 movdqu -16(@inptr[1],$offset),@inp[1]
604 aesdeclast @inp[2],@out[2]
605 aesdeclast @inp[3],@out[3]
606 movdqu -16(@inptr[2],$offset),@inp[2]
607 movdqu -16(@inptr[3],$offset),@inp[3]
609 movups @out[0],-16(@outptr[0],$offset)
610 movdqu (@inptr[0],$offset),@out[0]
611 movups @out[1],-16(@outptr[1],$offset)
612 movdqu (@inptr[1],$offset),@out[1]
614 movups @out[2],-16(@outptr[2],$offset)
615 movdqu (@inptr[2],$offset),@out[2]
617 movups @out[3],-16(@outptr[3],$offset)
618 movdqu (@inptr[3],$offset),@out[3]
625 mov 16(%rsp),%rax # original %rsp
629 lea `$inp_elm_size*4`($inp),$inp
631 jnz .Ldec4x_loop_grande
635 $code.=<<___ if ($win64);
636 movaps -0xd8(%rax),%xmm6
637 movaps -0xc8(%rax),%xmm7
638 movaps -0xb8(%rax),%xmm8
639 movaps -0xa8(%rax),%xmm9
640 movaps -0x98(%rax),%xmm10
641 movaps -0x88(%rax),%xmm11
642 movaps -0x78(%rax),%xmm12
643 #movaps -0x68(%rax),%xmm13
644 #movaps -0x58(%rax),%xmm14
645 #movaps -0x48(%rax),%xmm15
661 .cfi_def_cfa_register %rsp
665 .size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
# Register assignments for the 8-way AVX code path.
# One pointer register per stream; the same register is switched between
# the stream's input and output using the distances kept at 64(%rsp).
669 my @ptr=map("%r$_",(8..15));
# Running cipher state, one register per stream.
672 my @out=map("%xmm$_",(2..9));
# Only four input registers exist for eight streams, indexed with $i%4;
# the values loaded for streams 0..3 are written to the stack off-load area.
673 my @inp=map("%xmm$_",(10..13));
# $counters: block counters, handled as two 16-byte halves (32 and 48(%rsp));
# $zero: holds the 0-round key, and is temporarily cleared and reloaded
# while the counters are being updated.
674 my ($counters,$zero)=("%xmm14","%xmm15");
677 .type aesni_multi_cbc_encrypt_avx,\@function,3
679 aesni_multi_cbc_encrypt_avx:
681 _avx_cbc_enc_shortcut:
683 .cfi_def_cfa_register %rax
697 $code.=<<___ if ($win64);
700 movaps %xmm7,0x10(%rsp)
701 movaps %xmm8,0x20(%rsp)
702 movaps %xmm9,0x30(%rsp)
703 movaps %xmm10,0x40(%rsp)
704 movaps %xmm11,0x50(%rsp)
705 movaps %xmm12,-0x78(%rax)
706 movaps %xmm13,-0x68(%rax)
707 movaps %xmm14,-0x58(%rax)
708 movaps %xmm15,-0x48(%rax)
714 # +16 input sink [original %rsp and $num]
716 # +64 distances between inputs and outputs
717 # +128 off-load area for @inp[0..3]
721 mov %rax,16(%rsp) # original %rsp
722 .cfi_cfa_expression %rsp+16,deref,+8
726 vmovdqu ($key),$zero # 0-round key
727 lea 0x78($key),$key # size optimization
728 lea `$inp_elm_size*4`($inp),$inp
732 #mov $num,24(%rsp) # original $num
735 for($i=0;$i<8;$i++) {
736 my $temp = $i ? $offload : $offset;
737 $ptr_reg=&pointer_register($flavour,@ptr[$i]);
738 $temp_reg=&pointer_register($flavour,$temp);
740 # borrow $one for number of blocks
741 mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one
743 mov `$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg
746 mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg
747 cmovg $one,$num # find maximum
750 vmovdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i]
751 mov $one,`32+4*$i`(%rsp) # initialize counters
752 cmovle %rsp,@ptr[$i] # cancel input
753 sub @ptr[$i],$temp # distance between input and output
754 mov $temp,`64+8*$i`(%rsp) # initialize distances
761 vmovups 0x10-0x78($key),$rndkey1
762 vmovups 0x20-0x78($key),$rndkey0
763 mov 0xf0-0x78($key),$rounds
765 vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round
766 lea 128(%rsp),$offload # offload area
767 vpxor (@ptr[1]),$zero,@inp[1]
768 vpxor (@ptr[2]),$zero,@inp[2]
769 vpxor (@ptr[3]),$zero,@inp[3]
770 vpxor @inp[0],@out[0],@out[0]
771 vpxor (@ptr[4]),$zero,@inp[0]
772 vpxor @inp[1],@out[1],@out[1]
773 vpxor (@ptr[5]),$zero,@inp[1]
774 vpxor @inp[2],@out[2],@out[2]
775 vpxor (@ptr[6]),$zero,@inp[2]
776 vpxor @inp[3],@out[3],@out[3]
777 vpxor (@ptr[7]),$zero,@inp[3]
778 vpxor @inp[0],@out[4],@out[4]
779 mov \$1,$one # constant of 1
780 vpxor @inp[1],@out[5],@out[5]
781 vpxor @inp[2],@out[6],@out[6]
782 vpxor @inp[3],@out[7],@out[7]
788 for($i=0;$i<8;$i++) {
789 my $rndkey=($i&1)?$rndkey0:$rndkey1;
791 vaesenc $rndkey,@out[0],@out[0]
792 cmp 32+4*$i(%rsp),$one
794 $code.=<<___ if ($i);
795 mov 64+8*$i(%rsp),$offset
798 vaesenc $rndkey,@out[1],@out[1]
799 prefetcht0 31(@ptr[$i]) # prefetch input
800 vaesenc $rndkey,@out[2],@out[2]
802 $code.=<<___ if ($i>1);
803 prefetcht0 15(@ptr[$i-2]) # prefetch output
806 vaesenc $rndkey,@out[3],@out[3]
807 lea (@ptr[$i],$offset),$offset
808 cmovge %rsp,@ptr[$i] # cancel input
809 vaesenc $rndkey,@out[4],@out[4]
810 cmovg %rsp,$offset # sink output
811 vaesenc $rndkey,@out[5],@out[5]
813 vaesenc $rndkey,@out[6],@out[6]
814 vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round
815 mov $offset,64+8*$i(%rsp)
816 vaesenc $rndkey,@out[7],@out[7]
817 vmovups `16*(3+$i)-0x78`($key),$rndkey
818 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
820 $code.=<<___ if ($i<4);	# only inputs of streams 0..3 are off-loaded to the stack
821 vmovdqu @inp[$i%4],`16*$i`($offload) # off-load
825 vmovdqu 32(%rsp),$counters
826 prefetcht0 15(@ptr[$i-2]) # prefetch output
827 prefetcht0 15(@ptr[$i-1])
831 vaesenc $rndkey1,@out[0],@out[0]
832 vaesenc $rndkey1,@out[1],@out[1]
833 vaesenc $rndkey1,@out[2],@out[2]
834 vaesenc $rndkey1,@out[3],@out[3]
835 vaesenc $rndkey1,@out[4],@out[4]
836 vaesenc $rndkey1,@out[5],@out[5]
837 vaesenc $rndkey1,@out[6],@out[6]
838 vaesenc $rndkey1,@out[7],@out[7]
839 vmovups 0xb0-0x78($key),$rndkey1
841 vaesenc $rndkey0,@out[0],@out[0]
842 vaesenc $rndkey0,@out[1],@out[1]
843 vaesenc $rndkey0,@out[2],@out[2]
844 vaesenc $rndkey0,@out[3],@out[3]
845 vaesenc $rndkey0,@out[4],@out[4]
846 vaesenc $rndkey0,@out[5],@out[5]
847 vaesenc $rndkey0,@out[6],@out[6]
848 vaesenc $rndkey0,@out[7],@out[7]
849 vmovups 0xc0-0x78($key),$rndkey0
852 vaesenc $rndkey1,@out[0],@out[0]
853 vaesenc $rndkey1,@out[1],@out[1]
854 vaesenc $rndkey1,@out[2],@out[2]
855 vaesenc $rndkey1,@out[3],@out[3]
856 vaesenc $rndkey1,@out[4],@out[4]
857 vaesenc $rndkey1,@out[5],@out[5]
858 vaesenc $rndkey1,@out[6],@out[6]
859 vaesenc $rndkey1,@out[7],@out[7]
860 vmovups 0xd0-0x78($key),$rndkey1
862 vaesenc $rndkey0,@out[0],@out[0]
863 vaesenc $rndkey0,@out[1],@out[1]
864 vaesenc $rndkey0,@out[2],@out[2]
865 vaesenc $rndkey0,@out[3],@out[3]
866 vaesenc $rndkey0,@out[4],@out[4]
867 vaesenc $rndkey0,@out[5],@out[5]
868 vaesenc $rndkey0,@out[6],@out[6]
869 vaesenc $rndkey0,@out[7],@out[7]
870 vmovups 0xe0-0x78($key),$rndkey0
873 vaesenc $rndkey1,@out[0],@out[0]
874 vpxor $zero,$zero,$zero
875 vaesenc $rndkey1,@out[1],@out[1]
876 vaesenc $rndkey1,@out[2],@out[2]
877 vpcmpgtd $zero,$counters,$zero
878 vaesenc $rndkey1,@out[3],@out[3]
879 vaesenc $rndkey1,@out[4],@out[4]
880 vpaddd $counters,$zero,$zero # decrement counters
881 vmovdqu 48(%rsp),$counters
882 vaesenc $rndkey1,@out[5],@out[5]
883 mov 64(%rsp),$offset # pre-load 1st offset
884 vaesenc $rndkey1,@out[6],@out[6]
885 vaesenc $rndkey1,@out[7],@out[7]
886 vmovups 0x10-0x78($key),$rndkey1
888 vaesenclast $rndkey0,@out[0],@out[0]
889 vmovdqa $zero,32(%rsp) # update counters
890 vpxor $zero,$zero,$zero
891 vaesenclast $rndkey0,@out[1],@out[1]
892 vaesenclast $rndkey0,@out[2],@out[2]
893 vpcmpgtd $zero,$counters,$zero
894 vaesenclast $rndkey0,@out[3],@out[3]
895 vaesenclast $rndkey0,@out[4],@out[4]
896 vpaddd $zero,$counters,$counters # decrement counters
897 vmovdqu -0x78($key),$zero # 0-round
898 vaesenclast $rndkey0,@out[5],@out[5]
899 vaesenclast $rndkey0,@out[6],@out[6]
900 vmovdqa $counters,48(%rsp) # update counters
901 vaesenclast $rndkey0,@out[7],@out[7]
902 vmovups 0x20-0x78($key),$rndkey0
904 vmovups @out[0],-16(@ptr[0]) # write output
905 sub $offset,@ptr[0] # switch to input
906 vpxor 0x00($offload),@out[0],@out[0]
907 vmovups @out[1],-16(@ptr[1])
908 sub `64+1*8`(%rsp),@ptr[1]
909 vpxor 0x10($offload),@out[1],@out[1]
910 vmovups @out[2],-16(@ptr[2])
911 sub `64+2*8`(%rsp),@ptr[2]
912 vpxor 0x20($offload),@out[2],@out[2]
913 vmovups @out[3],-16(@ptr[3])
914 sub `64+3*8`(%rsp),@ptr[3]
915 vpxor 0x30($offload),@out[3],@out[3]
916 vmovups @out[4],-16(@ptr[4])
917 sub `64+4*8`(%rsp),@ptr[4]
918 vpxor @inp[0],@out[4],@out[4]
919 vmovups @out[5],-16(@ptr[5])
920 sub `64+5*8`(%rsp),@ptr[5]
921 vpxor @inp[1],@out[5],@out[5]
922 vmovups @out[6],-16(@ptr[6])
923 sub `64+6*8`(%rsp),@ptr[6]
924 vpxor @inp[2],@out[6],@out[6]
925 vmovups @out[7],-16(@ptr[7])
926 sub `64+7*8`(%rsp),@ptr[7]
927 vpxor @inp[3],@out[7],@out[7]
932 mov 16(%rsp),%rax # original %rsp
935 #lea `$inp_elm_size*8`($inp),$inp
937 #jnz .Lenc8x_loop_grande
942 $code.=<<___ if ($win64);
943 movaps -0xd8(%rax),%xmm6
944 movaps -0xc8(%rax),%xmm7
945 movaps -0xb8(%rax),%xmm8
946 movaps -0xa8(%rax),%xmm9
947 movaps -0x98(%rax),%xmm10
948 movaps -0x88(%rax),%xmm11
949 movaps -0x78(%rax),%xmm12
950 movaps -0x68(%rax),%xmm13
951 movaps -0x58(%rax),%xmm14
952 movaps -0x48(%rax),%xmm15
968 .cfi_def_cfa_register %rsp
972 .size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
974 .type aesni_multi_cbc_decrypt_avx,\@function,3
976 aesni_multi_cbc_decrypt_avx:
978 _avx_cbc_dec_shortcut:
980 .cfi_def_cfa_register %rax
994 $code.=<<___ if ($win64);
997 movaps %xmm7,0x10(%rsp)
998 movaps %xmm8,0x20(%rsp)
999 movaps %xmm9,0x30(%rsp)
1000 movaps %xmm10,0x40(%rsp)
1001 movaps %xmm11,0x50(%rsp)
1002 movaps %xmm12,-0x78(%rax)
1003 movaps %xmm13,-0x68(%rax)
1004 movaps %xmm14,-0x58(%rax)
1005 movaps %xmm15,-0x48(%rax)
1011 # +16 input sink [original %rsp and $num]
1013 # +64 distances between inputs and outputs
1014 # +128 off-load area for @inp[0..3]
1015 # +192 IV/input offload
1020 mov %rax,16(%rsp) # original %rsp
1021 .cfi_cfa_expression %rsp+16,deref,+8
1025 vmovdqu ($key),$zero # 0-round key
1026 lea 0x78($key),$key # size optimization
1027 lea `$inp_elm_size*4`($inp),$inp
1030 .Ldec8x_loop_grande:
1031 #mov $num,24(%rsp) # original $num
1034 for($i=0;$i<8;$i++) {
1035 my $temp = $i ? $offload : $offset;
1036 $ptr_reg=&pointer_register($flavour,@ptr[$i]);
1037 $temp_reg=&pointer_register($flavour,$temp);
1039 # borrow $one for number of blocks
1040 mov `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one
1042 mov `$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg
1045 mov `$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg
1046 cmovg $one,$num # find maximum
1049 vmovdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i]
1050 mov $one,`32+4*$i`(%rsp) # initialize counters
1051 cmovle %rsp,@ptr[$i] # cancel input
1052 sub @ptr[$i],$temp # distance between input and output
1053 mov $temp,`64+8*$i`(%rsp) # initialize distances
1054 vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV
1061 vmovups 0x10-0x78($key),$rndkey1
1062 vmovups 0x20-0x78($key),$rndkey0
1063 mov 0xf0-0x78($key),$rounds
1064 lea 192+128(%rsp),$offload # offload area
1066 vmovdqu (@ptr[0]),@out[0] # load inputs
1067 vmovdqu (@ptr[1]),@out[1]
1068 vmovdqu (@ptr[2]),@out[2]
1069 vmovdqu (@ptr[3]),@out[3]
1070 vmovdqu (@ptr[4]),@out[4]
1071 vmovdqu (@ptr[5]),@out[5]
1072 vmovdqu (@ptr[6]),@out[6]
1073 vmovdqu (@ptr[7]),@out[7]
1074 vmovdqu @out[0],0x00($offload) # offload inputs
1075 vpxor $zero,@out[0],@out[0] # xor inputs with 0-round
1076 vmovdqu @out[1],0x10($offload)
1077 vpxor $zero,@out[1],@out[1]
1078 vmovdqu @out[2],0x20($offload)
1079 vpxor $zero,@out[2],@out[2]
1080 vmovdqu @out[3],0x30($offload)
1081 vpxor $zero,@out[3],@out[3]
1082 vmovdqu @out[4],0x40($offload)
1083 vpxor $zero,@out[4],@out[4]
1084 vmovdqu @out[5],0x50($offload)
1085 vpxor $zero,@out[5],@out[5]
1086 vmovdqu @out[6],0x60($offload)
1087 vpxor $zero,@out[6],@out[6]
1088 vmovdqu @out[7],0x70($offload)
1089 vpxor $zero,@out[7],@out[7]
1091 mov \$1,$one # constant of 1
1097 for($i=0;$i<8;$i++) {
1098 my $rndkey=($i&1)?$rndkey0:$rndkey1;
1100 vaesdec $rndkey,@out[0],@out[0]
1101 cmp 32+4*$i(%rsp),$one
1103 $code.=<<___ if ($i);
1104 mov 64+8*$i(%rsp),$offset
1107 vaesdec $rndkey,@out[1],@out[1]
1108 prefetcht0 31(@ptr[$i]) # prefetch input
1109 vaesdec $rndkey,@out[2],@out[2]
1111 $code.=<<___ if ($i>1);
1112 prefetcht0 15(@ptr[$i-2]) # prefetch output
1115 vaesdec $rndkey,@out[3],@out[3]
1116 lea (@ptr[$i],$offset),$offset
1117 cmovge %rsp,@ptr[$i] # cancel input
1118 vaesdec $rndkey,@out[4],@out[4]
1119 cmovg %rsp,$offset # sink output
1120 vaesdec $rndkey,@out[5],@out[5]
1121 sub @ptr[$i],$offset
1122 vaesdec $rndkey,@out[6],@out[6]
1123 vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input
1124 mov $offset,64+8*$i(%rsp)
1125 vaesdec $rndkey,@out[7],@out[7]
1126 vmovups `16*(3+$i)-0x78`($key),$rndkey
1127 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
1129 $code.=<<___ if ($i<4);
1130 vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load
1134 vmovdqu 32(%rsp),$counters
1135 prefetcht0 15(@ptr[$i-2]) # prefetch output
1136 prefetcht0 15(@ptr[$i-1])
1140 vaesdec $rndkey1,@out[0],@out[0]
1141 vaesdec $rndkey1,@out[1],@out[1]
1142 vaesdec $rndkey1,@out[2],@out[2]
1143 vaesdec $rndkey1,@out[3],@out[3]
1144 vaesdec $rndkey1,@out[4],@out[4]
1145 vaesdec $rndkey1,@out[5],@out[5]
1146 vaesdec $rndkey1,@out[6],@out[6]
1147 vaesdec $rndkey1,@out[7],@out[7]
1148 vmovups 0xb0-0x78($key),$rndkey1
1150 vaesdec $rndkey0,@out[0],@out[0]
1151 vaesdec $rndkey0,@out[1],@out[1]
1152 vaesdec $rndkey0,@out[2],@out[2]
1153 vaesdec $rndkey0,@out[3],@out[3]
1154 vaesdec $rndkey0,@out[4],@out[4]
1155 vaesdec $rndkey0,@out[5],@out[5]
1156 vaesdec $rndkey0,@out[6],@out[6]
1157 vaesdec $rndkey0,@out[7],@out[7]
1158 vmovups 0xc0-0x78($key),$rndkey0
1161 vaesdec $rndkey1,@out[0],@out[0]
1162 vaesdec $rndkey1,@out[1],@out[1]
1163 vaesdec $rndkey1,@out[2],@out[2]
1164 vaesdec $rndkey1,@out[3],@out[3]
1165 vaesdec $rndkey1,@out[4],@out[4]
1166 vaesdec $rndkey1,@out[5],@out[5]
1167 vaesdec $rndkey1,@out[6],@out[6]
1168 vaesdec $rndkey1,@out[7],@out[7]
1169 vmovups 0xd0-0x78($key),$rndkey1
1171 vaesdec $rndkey0,@out[0],@out[0]
1172 vaesdec $rndkey0,@out[1],@out[1]
1173 vaesdec $rndkey0,@out[2],@out[2]
1174 vaesdec $rndkey0,@out[3],@out[3]
1175 vaesdec $rndkey0,@out[4],@out[4]
1176 vaesdec $rndkey0,@out[5],@out[5]
1177 vaesdec $rndkey0,@out[6],@out[6]
1178 vaesdec $rndkey0,@out[7],@out[7]
1179 vmovups 0xe0-0x78($key),$rndkey0
1182 vaesdec $rndkey1,@out[0],@out[0]
1183 vpxor $zero,$zero,$zero
1184 vaesdec $rndkey1,@out[1],@out[1]
1185 vaesdec $rndkey1,@out[2],@out[2]
1186 vpcmpgtd $zero,$counters,$zero
1187 vaesdec $rndkey1,@out[3],@out[3]
1188 vaesdec $rndkey1,@out[4],@out[4]
1189 vpaddd $counters,$zero,$zero # decrement counters
1190 vmovdqu 48(%rsp),$counters
1191 vaesdec $rndkey1,@out[5],@out[5]
1192 mov 64(%rsp),$offset # pre-load 1st offset
1193 vaesdec $rndkey1,@out[6],@out[6]
1194 vaesdec $rndkey1,@out[7],@out[7]
1195 vmovups 0x10-0x78($key),$rndkey1
1197 vaesdeclast $rndkey0,@out[0],@out[0]
1198 vmovdqa $zero,32(%rsp) # update counters
1199 vpxor $zero,$zero,$zero
1200 vaesdeclast $rndkey0,@out[1],@out[1]
1201 vpxor 0x00($offload),@out[0],@out[0] # xor with IV
1202 vaesdeclast $rndkey0,@out[2],@out[2]
1203 vpxor 0x10($offload),@out[1],@out[1]
1204 vpcmpgtd $zero,$counters,$zero
1205 vaesdeclast $rndkey0,@out[3],@out[3]
1206 vpxor 0x20($offload),@out[2],@out[2]
1207 vaesdeclast $rndkey0,@out[4],@out[4]
1208 vpxor 0x30($offload),@out[3],@out[3]
1209 vpaddd $zero,$counters,$counters # decrement counters
1210 vmovdqu -0x78($key),$zero # 0-round
1211 vaesdeclast $rndkey0,@out[5],@out[5]
1212 vpxor 0x40($offload),@out[4],@out[4]
1213 vaesdeclast $rndkey0,@out[6],@out[6]
1214 vpxor 0x50($offload),@out[5],@out[5]
1215 vmovdqa $counters,48(%rsp) # update counters
1216 vaesdeclast $rndkey0,@out[7],@out[7]
1217 vpxor 0x60($offload),@out[6],@out[6]
1218 vmovups 0x20-0x78($key),$rndkey0
1220 vmovups @out[0],-16(@ptr[0]) # write output
1221 sub $offset,@ptr[0] # switch to input
1222 vmovdqu 128+0(%rsp),@out[0]
1223 vpxor 0x70($offload),@out[7],@out[7]
1224 vmovups @out[1],-16(@ptr[1])
1225 sub `64+1*8`(%rsp),@ptr[1]
1226 vmovdqu @out[0],0x00($offload)
1227 vpxor $zero,@out[0],@out[0]
1228 vmovdqu 128+16(%rsp),@out[1]
1229 vmovups @out[2],-16(@ptr[2])
1230 sub `64+2*8`(%rsp),@ptr[2]
1231 vmovdqu @out[1],0x10($offload)
1232 vpxor $zero,@out[1],@out[1]
1233 vmovdqu 128+32(%rsp),@out[2]
1234 vmovups @out[3],-16(@ptr[3])
1235 sub `64+3*8`(%rsp),@ptr[3]
1236 vmovdqu @out[2],0x20($offload)
1237 vpxor $zero,@out[2],@out[2]
1238 vmovdqu 128+48(%rsp),@out[3]
1239 vmovups @out[4],-16(@ptr[4])
1240 sub `64+4*8`(%rsp),@ptr[4]
1241 vmovdqu @out[3],0x30($offload)
1242 vpxor $zero,@out[3],@out[3]
1243 vmovdqu @inp[0],0x40($offload)
1244 vpxor @inp[0],$zero,@out[4]
1245 vmovups @out[5],-16(@ptr[5])
1246 sub `64+5*8`(%rsp),@ptr[5]
1247 vmovdqu @inp[1],0x50($offload)
1248 vpxor @inp[1],$zero,@out[5]
1249 vmovups @out[6],-16(@ptr[6])
1250 sub `64+6*8`(%rsp),@ptr[6]
1251 vmovdqu @inp[2],0x60($offload)
1252 vpxor @inp[2],$zero,@out[6]
1253 vmovups @out[7],-16(@ptr[7])
1254 sub `64+7*8`(%rsp),@ptr[7]
1255 vmovdqu @inp[3],0x70($offload)
1256 vpxor @inp[3],$zero,@out[7]
1262 mov 16(%rsp),%rax # original %rsp
1265 #lea `$inp_elm_size*8`($inp),$inp
1267 #jnz .Ldec8x_loop_grande
1272 $code.=<<___ if ($win64);
1273 movaps -0xd8(%rax),%xmm6
1274 movaps -0xc8(%rax),%xmm7
1275 movaps -0xb8(%rax),%xmm8
1276 movaps -0xa8(%rax),%xmm9
1277 movaps -0x98(%rax),%xmm10
1278 movaps -0x88(%rax),%xmm11
1279 movaps -0x78(%rax),%xmm12
1280 movaps -0x68(%rax),%xmm13
1281 movaps -0x58(%rax),%xmm14
1282 movaps -0x48(%rax),%xmm15
1298 .cfi_def_cfa_register %rsp
1302 .size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
1307 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1308 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1315 .extern __imp_RtlVirtualUnwind
1316 .type se_handler,\@abi-omnipotent
1330 mov 120($context),%rax # pull context->Rax
1331 mov 248($context),%rbx # pull context->Rip
1333 mov 8($disp),%rsi # disp->ImageBase
1334 mov 56($disp),%r11 # disp->HandlerData
1336 mov 0(%r11),%r10d # HandlerData[0]
1337 lea (%rsi,%r10),%r10 # prologue label
1338 cmp %r10,%rbx # context->Rip<.Lprologue
1341 mov 152($context),%rax # pull context->Rsp
1343 mov 4(%r11),%r10d # HandlerData[1]
1344 lea (%rsi,%r10),%r10 # epilogue label
1345 cmp %r10,%rbx # context->Rip>=.Lepilogue
1348 mov 16(%rax),%rax # pull saved stack pointer
1356 mov %rbx,144($context) # restore context->Rbx
1357 mov %rbp,160($context) # restore context->Rbp
1358 mov %r12,216($context) # restore context->R12
1359 mov %r13,224($context) # restore context->R13
1360 mov %r14,232($context) # restore context->R14
1361 mov %r15,240($context) # restore context->R15
1363 lea -56-10*16(%rax),%rsi
1364 lea 512($context),%rdi # &context.Xmm6
1366 .long 0xa548f3fc # cld; rep movsq
1371 mov %rax,152($context) # restore context->Rsp
1372 mov %rsi,168($context) # restore context->Rsi
1373 mov %rdi,176($context) # restore context->Rdi
1375 mov 40($disp),%rdi # disp->ContextRecord
1376 mov $context,%rsi # context
1377 mov \$154,%ecx # sizeof(CONTEXT)
1378 .long 0xa548f3fc # cld; rep movsq
1381 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1382 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1383 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1384 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1385 mov 40(%rsi),%r10 # disp->ContextRecord
1386 lea 56(%rsi),%r11 # &disp->HandlerData
1387 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1388 mov %r10,32(%rsp) # arg5
1389 mov %r11,40(%rsp) # arg6
1390 mov %r12,48(%rsp) # arg7
1391 mov %rcx,56(%rsp) # arg8, (NULL)
1392 call *__imp_RtlVirtualUnwind(%rip)
1394 mov \$1,%eax # ExceptionContinueSearch
1406 .size se_handler,.-se_handler
1410 .rva .LSEH_begin_aesni_multi_cbc_encrypt
1411 .rva .LSEH_end_aesni_multi_cbc_encrypt
1412 .rva .LSEH_info_aesni_multi_cbc_encrypt
1413 .rva .LSEH_begin_aesni_multi_cbc_decrypt
1414 .rva .LSEH_end_aesni_multi_cbc_decrypt
1415 .rva .LSEH_info_aesni_multi_cbc_decrypt
1417 $code.=<<___ if ($avx);
1418 .rva .LSEH_begin_aesni_multi_cbc_encrypt_avx
1419 .rva .LSEH_end_aesni_multi_cbc_encrypt_avx
1420 .rva .LSEH_info_aesni_multi_cbc_encrypt_avx
1421 .rva .LSEH_begin_aesni_multi_cbc_decrypt_avx
1422 .rva .LSEH_end_aesni_multi_cbc_decrypt_avx
1423 .rva .LSEH_info_aesni_multi_cbc_decrypt_avx
1428 .LSEH_info_aesni_multi_cbc_encrypt:
1431 .rva .Lenc4x_body,.Lenc4x_epilogue # HandlerData[]
1432 .LSEH_info_aesni_multi_cbc_decrypt:
1435 .rva .Ldec4x_body,.Ldec4x_epilogue # HandlerData[]
1437 $code.=<<___ if ($avx);
1438 .LSEH_info_aesni_multi_cbc_encrypt_avx:
1441 .rva .Lenc8x_body,.Lenc8x_epilogue # HandlerData[]
1442 .LSEH_info_aesni_multi_cbc_decrypt_avx:
1445 .rva .Ldec8x_body,.Ldec8x_epilogue # HandlerData[]
1448 ####################################################################
1451 local *opcode=shift;
1455 $rex|=0x04 if($dst>=8);
1456 $rex|=0x01 if($src>=8);
1457 push @opcode,$rex|0x40 if($rex);
1464 if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1465 rex(\@opcode,$4,$3);
1466 push @opcode,0x0f,0x3a,0xdf;
1467 push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
1469 push @opcode,$c=~/^0/?oct($c):$c;
1470 return ".byte\t".join(',',@opcode);
1472 elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1475 "aesenc" => 0xdc, "aesenclast" => 0xdd,
1476 "aesdec" => 0xde, "aesdeclast" => 0xdf
1478 return undef if (!defined($opcodelet{$1}));
1479 rex(\@opcode,$3,$2);
1480 push @opcode,0x0f,0x38,$opcodelet{$1};
1481 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
1482 return ".byte\t".join(',',@opcode);
1484 elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
1486 "aesenc" => 0xdc, "aesenclast" => 0xdd,
1487 "aesdec" => 0xde, "aesdeclast" => 0xdf
1489 return undef if (!defined($opcodelet{$1}));
1491 push @opcode,0x44 if ($3>=8);
1492 push @opcode,0x0f,0x38,$opcodelet{$1};
1493 push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
1494 push @opcode,($off=~/^0/?oct($off):$off)&0xff;
1495 return ".byte\t".join(',',@opcode);
# Evaluate every `...` expression embedded in the generated text and splice
# the result in; this is what turns the computed displacements into numbers.
1500 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Hand every AES instruction line to aesni(), which returns a ".byte"
# encoding for the operand forms it recognises. The match runs to the last
# %xmm operand and the replacement swallows the rest of the line, so trailing
# comments on those lines are dropped. The leading \b keeps the VEX
# mnemonics (vaes...) from matching.
1501 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
# A failing close is fatal so that write errors are not silently lost.
1504 close STDOUT or die "error closing STDOUT: $!";