2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # Multi-buffer AES-NI procedures process several independent buffers
18 # in parallel by interleaving independent instructions.
20 # Cycles per byte for interleave factor 4:
23 # ---------------------------
24 # Westmere 5.00/4=1.25 5.13/4=1.28
25 # Atom 15.0/4=3.75 ?15.7/4=3.93
26 # Sandy Bridge 5.06/4=1.27 5.18/4=1.29
27 # Ivy Bridge 5.06/4=1.27 5.14/4=1.29
28 # Haswell 4.44/4=1.11 4.44/4=1.11
29 # Bulldozer 5.75/4=1.44 5.76/4=1.44
31 # Cycles per byte for interleave factor 8 (not implemented for
32 # pre-AVX processors, where higher interleave factor incidentally
33 # doesn't result in improvement):
36 # ---------------------------
37 # Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*)
38 # Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*)
39 # Haswell 5.00/8=0.63 5.00/8=0.63
40 # Bulldozer 5.75/8=0.72 5.77/8=0.72
42 # (*) Sandy/Ivy Bridge are known to handle high interleave factors
# suboptimally.
# Command-line handling; the script is run as "script [flavour] [output]".
45 # $output is the last argument if it looks like a file (it has an extension)
46 # $flavour is the first argument if it doesn't look like a file
47 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
48 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Win64 mode is selected by a nasm/masm/mingw64 flavour or by an output
# file whose name ends in .asm.
50 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Find the x86_64-xlate.pl translator: first in this script's own directory,
# then in ../../perlasm relative to it; abort if neither file exists.
52 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
53 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
54 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
55 die "can't locate x86_64-xlate.pl";
# Toolchain probes: each one sets the avx level to 0, 1 or 2 according to
# the version reported by the assembler or compiler. The later probes are
# attempted only while the level is still zero.
# Probe 1: GNU assembler, queried through the compiler named by CC.
59 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
60 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
61 $avx = ($1>=2.19) + ($1>=2.22);
# Probe 2: NASM, Win64 builds only.
64 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
65 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
66 $avx = ($1>=2.09) + ($1>=2.10);
# Probe 3: Microsoft ml64, Win64 builds only.
69 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
70 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
71 $avx = ($1>=10) + ($1>=11);
# Probe 4: clang or another LLVM-based compiler named by CC.
74 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
75 $avx = ($2>=3.0) + ($2>3.0);
# Open a pipe to the translator, run with the current perl interpreter,
# passing the flavour and the output file name along.
78 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
79 or die "can't call $xlate: $!";
82 # void aesni_multi_cbc_encrypt (
83 # struct { void *inp,*out; int blocks; double iv[2]; } inp[8];
85 # int num); /* 1 or 2 */
# Register assignment for the 4x (non-AVX) code path.
87 $inp="%rdi"; # 1st arg
88 $key="%rsi"; # 2nd arg
# r8-r11 hold the four input pointers, r12-r15 the four output pointers.
91 @inptr=map("%r$_",(8..11));
92 @outptr=map("%r$_",(12..15));
# xmm0/xmm1 hold round keys, xmm2-xmm5 the four cipher states and
# xmm6-xmm9 the four input blocks; xmm10-xmm12 are the lane counters,
# a mask and the 0-round key.
94 ($rndkey0,$rndkey1)=("%xmm0","%xmm1");
95 @out=map("%xmm$_",(2..5));
96 @inp=map("%xmm$_",(6..9));
97 ($counters,$mask,$zero)=map("%xmm$_",(10..12));
# Scalar helpers: round count, constant one, sink pointer, running offset.
99 ($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx");
# 4x interleaved CBC encryption entry point (non-AVX code path).
104 .extern OPENSSL_ia32cap_P
106 .globl aesni_multi_cbc_encrypt
107 .type aesni_multi_cbc_encrypt,\@function,3
109 aesni_multi_cbc_encrypt:
# Emitted only when the assembler can handle AVX: test the AVX capability
# bit at run time and branch to the 8x implementation when it is set.
112 $code.=<<___ if ($avx);
115 mov OPENSSL_ia32cap_P+4(%rip),%ecx
116 test \$`1<<28`,%ecx # AVX bit
117 jnz _avx_cbc_enc_shortcut
124 .cfi_def_cfa_register %rax
# Win64 only: save xmm registers in the frame; the epilogue reloads the
# ones this code path uses.
138 $code.=<<___ if ($win64);
141 movaps %xmm7,0x10(%rsp)
142 movaps %xmm8,0x20(%rsp)
143 movaps %xmm9,0x30(%rsp)
144 movaps %xmm10,0x40(%rsp)
145 movaps %xmm11,0x50(%rsp)
146 movaps %xmm12,0x60(%rsp)
147 movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
148 movaps %xmm14,-0x58(%rax)
149 movaps %xmm15,-0x48(%rax)
155 # +16 input sink [original %rsp and $num]
# Keep the caller's stack pointer at 16(%rsp); the epilogue and the Win64
# exception handler both read it back from there.
160 mov %rax,16(%rsp) # original %rsp
161 .cfi_cfa_expression %rsp+16,deref,+8
164 movdqu ($key),$zero # 0-round key
# The key pointer is biased by 0x78, so round keys are addressed below
# as N-0x78 relative to it.
165 lea 0x78($key),$key # size optimization
169 mov $num,24(%rsp) # original $num
# Per-lane setup, unrolled four times at generation time. For lane i the
# 40-byte descriptor supplies the block count, the input and output
# pointers and the IV; the count is also stored in the lane's counter slot
# at 32+4*i on the stack, and the running maximum of the counts is kept.
# The cmovle redirects the input pointer of a cancelled lane to the stack.
172 for($i=0;$i<4;$i++) {
174 mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
175 mov `40*$i+0-40*2`($inp),@inptr[$i]
177 mov `40*$i+8-40*2`($inp),@outptr[$i]
178 cmovg $one,$num # find maximum
180 movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV
181 mov $one,`32+4*$i`(%rsp) # initialize counters
182 cmovle %rsp,@inptr[$i] # cancel input
# Preload the first two round keys and the value at offset 0xf0 of the
# key schedule (kept in the rounds register).
189 movups 0x10-0x78($key),$rndkey1
191 movups 0x20-0x78($key),$rndkey0
193 mov 0xf0-0x78($key),$rounds
195 movdqu (@inptr[0]),@inp[0] # load inputs
197 movdqu (@inptr[1]),@inp[1]
199 movdqu (@inptr[2]),@inp[2]
201 movdqu (@inptr[3]),@inp[3]
204 movdqa 32(%rsp),$counters # load counters
211 lea 16(%rsp),$sink # sink pointer
212 mov \$1,$one # constant of 1
# One cipher round on all four lanes, interleaved with prefetches of the
# upcoming input of every lane (0 through 3), followed by the load of the
# round key at offset 0x30. Lane 3 previously had no input prefetch because
# lane 2 was prefetched twice.
215 aesenc $rndkey1,@out[0]
216 prefetcht0 31(@inptr[0],$offset) # prefetch input
217 prefetcht0 31(@inptr[1],$offset)
218 aesenc $rndkey1,@out[1]
219 prefetcht0 31(@inptr[2],$offset)
220 prefetcht0 31(@inptr[3],$offset)
221 aesenc $rndkey1,@out[2]
222 aesenc $rndkey1,@out[3]
223 movups 0x30-0x78($key),$rndkey1
# Four more rounds, one generated per lane index i. Each round also
# compares the constant one with lane i's counter slot (32+4*i on the
# stack); the conditional moves then point an exhausted lane's input and
# output at the sink so its loads and stores become harmless. The round
# key register alternates with the parity of i and is reloaded with the
# key at 0x40+16*i.
225 for($i=0;$i<4;$i++) {
226 my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
228 cmp `32+4*$i`(%rsp),$one
229 aesenc $rndkey,@out[0]
230 aesenc $rndkey,@out[1]
231 aesenc $rndkey,@out[2]
232 cmovge $sink,@inptr[$i] # cancel input
233 cmovg $sink,@outptr[$i] # sink output
234 aesenc $rndkey,@out[3]
235 movups `0x40+16*$i-0x78`($key),$rndkey
# Next two rounds: prefetch the output locations of all four lanes and
# update the lane counters, both in the register and in the stack copy.
239 movdqa $counters,$mask
240 aesenc $rndkey0,@out[0]
241 prefetcht0 15(@outptr[0],$offset) # prefetch output
242 prefetcht0 15(@outptr[1],$offset)
243 aesenc $rndkey0,@out[1]
244 prefetcht0 15(@outptr[2],$offset)
245 prefetcht0 15(@outptr[3],$offset)
246 aesenc $rndkey0,@out[2]
247 aesenc $rndkey0,@out[3]
248 movups 0x80-0x78($key),$rndkey0
251 aesenc $rndkey1,@out[0]
253 movdqu -0x78($key),$zero # reload 0-round key
254 aesenc $rndkey1,@out[1]
255 paddd $mask,$counters # decrement counters
256 movdqa $counters,32(%rsp) # update counters
257 aesenc $rndkey1,@out[2]
258 aesenc $rndkey1,@out[3]
259 movups 0x90-0x78($key),$rndkey1
# Further rounds, alternating between the two round-key registers; after
# each group the register just used is reloaded with the key two slots
# ahead (0xa0, 0xb0, ... 0xe0).
263 aesenc $rndkey0,@out[0]
264 aesenc $rndkey0,@out[1]
265 aesenc $rndkey0,@out[2]
266 aesenc $rndkey0,@out[3]
267 movups 0xa0-0x78($key),$rndkey0
271 aesenc $rndkey1,@out[0]
272 aesenc $rndkey1,@out[1]
273 aesenc $rndkey1,@out[2]
274 aesenc $rndkey1,@out[3]
275 movups 0xb0-0x78($key),$rndkey1
277 aesenc $rndkey0,@out[0]
278 aesenc $rndkey0,@out[1]
279 aesenc $rndkey0,@out[2]
280 aesenc $rndkey0,@out[3]
281 movups 0xc0-0x78($key),$rndkey0
285 aesenc $rndkey1,@out[0]
286 aesenc $rndkey1,@out[1]
287 aesenc $rndkey1,@out[2]
288 aesenc $rndkey1,@out[3]
289 movups 0xd0-0x78($key),$rndkey1
291 aesenc $rndkey0,@out[0]
292 aesenc $rndkey0,@out[1]
293 aesenc $rndkey0,@out[2]
294 aesenc $rndkey0,@out[3]
295 movups 0xe0-0x78($key),$rndkey0
# Closing rounds: fetch the next input block of every lane, reload the
# round keys at 0x10 and 0x20 for the next iteration, and finish each
# lane with the last-round instruction.
300 aesenc $rndkey1,@out[0]
301 aesenc $rndkey1,@out[1]
302 aesenc $rndkey1,@out[2]
303 aesenc $rndkey1,@out[3]
304 movdqu (@inptr[0],$offset),@inp[0]
305 movdqu 0x10-0x78($key),$rndkey1
307 aesenclast $rndkey0,@out[0]
308 movdqu (@inptr[1],$offset),@inp[1]
310 aesenclast $rndkey0,@out[1]
311 movdqu (@inptr[2],$offset),@inp[2]
313 aesenclast $rndkey0,@out[2]
314 movdqu (@inptr[3],$offset),@inp[3]
316 aesenclast $rndkey0,@out[3]
317 movdqu 0x20-0x78($key),$rndkey0
# Store the four result blocks 16 bytes below the current output offset.
320 movups @out[0],-16(@outptr[0],$offset)
322 movups @out[1],-16(@outptr[1],$offset)
324 movups @out[2],-16(@outptr[2],$offset)
326 movups @out[3],-16(@outptr[3],$offset)
# Recover the caller's stack pointer that the prologue stored at 16(%rsp).
332 mov 16(%rsp),%rax # original %rsp
336 #pxor @inp[0],@out[0]
337 #pxor @inp[1],@out[1]
338 #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME!
339 #pxor @inp[2],@out[2]
340 #movdqu @out[1],`40*1+24-40*2`($inp)
341 #pxor @inp[3],@out[3]
342 #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller
343 #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out...
# Step the descriptor pointer past four 40-byte entries and take the
# outer-loop branch when the zero flag is clear.
345 lea `40*4`($inp),$inp
347 jnz .Lenc4x_loop_grande
# Win64 only: reload the saved xmm registers relative to the original
# stack pointer; xmm13-xmm15 are not touched by this code path.
351 $code.=<<___ if ($win64);
352 movaps -0xd8(%rax),%xmm6
353 movaps -0xc8(%rax),%xmm7
354 movaps -0xb8(%rax),%xmm8
355 movaps -0xa8(%rax),%xmm9
356 movaps -0x98(%rax),%xmm10
357 movaps -0x88(%rax),%xmm11
358 movaps -0x78(%rax),%xmm12
359 #movaps -0x68(%rax),%xmm13
360 #movaps -0x58(%rax),%xmm14
361 #movaps -0x48(%rax),%xmm15
377 .cfi_def_cfa_register %rsp
381 .size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
# 4x interleaved CBC decryption entry point (non-AVX code path).
383 .globl aesni_multi_cbc_decrypt
384 .type aesni_multi_cbc_decrypt,\@function,3
386 aesni_multi_cbc_decrypt:
# Emitted only when the assembler can handle AVX: test the AVX capability
# bit at run time and branch to the 8x implementation when it is set.
389 $code.=<<___ if ($avx);
392 mov OPENSSL_ia32cap_P+4(%rip),%ecx
393 test \$`1<<28`,%ecx # AVX bit
394 jnz _avx_cbc_dec_shortcut
401 .cfi_def_cfa_register %rax
# Win64 only: save xmm registers in the frame; the epilogue reloads the
# ones this code path uses.
415 $code.=<<___ if ($win64);
418 movaps %xmm7,0x10(%rsp)
419 movaps %xmm8,0x20(%rsp)
420 movaps %xmm9,0x30(%rsp)
421 movaps %xmm10,0x40(%rsp)
422 movaps %xmm11,0x50(%rsp)
423 movaps %xmm12,0x60(%rsp)
424 movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
425 movaps %xmm14,-0x58(%rax)
426 movaps %xmm15,-0x48(%rax)
432 # +16 input sink [original %rsp and $num]
# Keep the caller's stack pointer at 16(%rsp); the epilogue and the Win64
# exception handler both read it back from there.
437 mov %rax,16(%rsp) # original %rsp
438 .cfi_cfa_expression %rsp+16,deref,+8
441 movdqu ($key),$zero # 0-round key
# The key pointer is biased by 0x78, so round keys are addressed below
# as N-0x78 relative to it.
442 lea 0x78($key),$key # size optimization
446 mov $num,24(%rsp) # original $num
# Per-lane setup, unrolled four times at generation time. Same layout as
# in the encrypt routine, except that the IV goes into the input register
# set while the first ciphertext block of each lane is loaded into the
# state registers below.
449 for($i=0;$i<4;$i++) {
451 mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
452 mov `40*$i+0-40*2`($inp),@inptr[$i]
454 mov `40*$i+8-40*2`($inp),@outptr[$i]
455 cmovg $one,$num # find maximum
457 movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV
458 mov $one,`32+4*$i`(%rsp) # initialize counters
459 cmovle %rsp,@inptr[$i] # cancel input
# Preload the first two round keys and the value at offset 0xf0 of the
# key schedule (kept in the rounds register).
466 movups 0x10-0x78($key),$rndkey1
467 movups 0x20-0x78($key),$rndkey0
468 mov 0xf0-0x78($key),$rounds
469 movdqu (@inptr[0]),@out[0] # load inputs
470 movdqu (@inptr[1]),@out[1]
472 movdqu (@inptr[2]),@out[2]
474 movdqu (@inptr[3]),@out[3]
477 movdqa 32(%rsp),$counters # load counters
484 lea 16(%rsp),$sink # sink pointer
485 mov \$1,$one # constant of 1
# One decryption round on all four lanes, interleaved with prefetches of
# the upcoming input of every lane (0 through 3).
488 aesdec $rndkey1,@out[0]
489 prefetcht0 31(@inptr[0],$offset) # prefetch input
490 prefetcht0 31(@inptr[1],$offset)
491 aesdec $rndkey1,@out[1]
492 prefetcht0 31(@inptr[2],$offset)
493 prefetcht0 31(@inptr[3],$offset)
494 aesdec $rndkey1,@out[2]
495 aesdec $rndkey1,@out[3]
496 movups 0x30-0x78($key),$rndkey1
# Four more rounds, one generated per lane index i. Each round also
# compares the constant one with lane i's counter slot (32+4*i on the
# stack); the conditional moves then point an exhausted lane's input and
# output at the sink. The round key register alternates with the parity
# of i and is reloaded with the key at 0x40+16*i.
498 for($i=0;$i<4;$i++) {
499 my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
501 cmp `32+4*$i`(%rsp),$one
502 aesdec $rndkey,@out[0]
503 aesdec $rndkey,@out[1]
504 aesdec $rndkey,@out[2]
505 cmovge $sink,@inptr[$i] # cancel input
506 cmovg $sink,@outptr[$i] # sink output
507 aesdec $rndkey,@out[3]
508 movups `0x40+16*$i-0x78`($key),$rndkey
# Next two rounds: prefetch the output locations of all four lanes and
# update the lane counters, both in the register and in the stack copy.
512 movdqa $counters,$mask
513 aesdec $rndkey0,@out[0]
514 prefetcht0 15(@outptr[0],$offset) # prefetch output
515 prefetcht0 15(@outptr[1],$offset)
516 aesdec $rndkey0,@out[1]
517 prefetcht0 15(@outptr[2],$offset)
518 prefetcht0 15(@outptr[3],$offset)
519 aesdec $rndkey0,@out[2]
520 aesdec $rndkey0,@out[3]
521 movups 0x80-0x78($key),$rndkey0
524 aesdec $rndkey1,@out[0]
526 movdqu -0x78($key),$zero # reload 0-round key
527 aesdec $rndkey1,@out[1]
528 paddd $mask,$counters # decrement counters
529 movdqa $counters,32(%rsp) # update counters
530 aesdec $rndkey1,@out[2]
531 aesdec $rndkey1,@out[3]
532 movups 0x90-0x78($key),$rndkey1
# Further rounds, alternating between the two round-key registers; after
# each group the register just used is reloaded with the key two slots
# ahead (0xa0, 0xb0, ... 0xe0).
536 aesdec $rndkey0,@out[0]
537 aesdec $rndkey0,@out[1]
538 aesdec $rndkey0,@out[2]
539 aesdec $rndkey0,@out[3]
540 movups 0xa0-0x78($key),$rndkey0
544 aesdec $rndkey1,@out[0]
545 aesdec $rndkey1,@out[1]
546 aesdec $rndkey1,@out[2]
547 aesdec $rndkey1,@out[3]
548 movups 0xb0-0x78($key),$rndkey1
550 aesdec $rndkey0,@out[0]
551 aesdec $rndkey0,@out[1]
552 aesdec $rndkey0,@out[2]
553 aesdec $rndkey0,@out[3]
554 movups 0xc0-0x78($key),$rndkey0
558 aesdec $rndkey1,@out[0]
559 aesdec $rndkey1,@out[1]
560 aesdec $rndkey1,@out[2]
561 aesdec $rndkey1,@out[3]
562 movups 0xd0-0x78($key),$rndkey1
564 aesdec $rndkey0,@out[0]
565 aesdec $rndkey0,@out[1]
566 aesdec $rndkey0,@out[2]
567 aesdec $rndkey0,@out[3]
568 movups 0xe0-0x78($key),$rndkey0
# Closing rounds. The IV of each lane is xored with the key loaded from
# 0xe0 and the result is used as the operand of the last-round
# instruction, which folds the CBC xor into that final round.
573 aesdec $rndkey1,@out[0]
574 aesdec $rndkey1,@out[1]
575 aesdec $rndkey1,@out[2]
576 pxor $rndkey0,@inp[0]
577 pxor $rndkey0,@inp[1]
578 aesdec $rndkey1,@out[3]
579 movdqu 0x10-0x78($key),$rndkey1
580 pxor $rndkey0,@inp[2]
581 pxor $rndkey0,@inp[3]
582 movdqu 0x20-0x78($key),$rndkey0
584 aesdeclast @inp[0],@out[0]
585 aesdeclast @inp[1],@out[1]
586 movdqu -16(@inptr[0],$offset),@inp[0] # load next IV
587 movdqu -16(@inptr[1],$offset),@inp[1]
588 aesdeclast @inp[2],@out[2]
589 aesdeclast @inp[3],@out[3]
590 movdqu -16(@inptr[2],$offset),@inp[2]
591 movdqu -16(@inptr[3],$offset),@inp[3]
# Store each plaintext block 16 bytes below the current output offset and
# load the next ciphertext block of the same lane into the state register.
593 movups @out[0],-16(@outptr[0],$offset)
594 movdqu (@inptr[0],$offset),@out[0]
595 movups @out[1],-16(@outptr[1],$offset)
596 movdqu (@inptr[1],$offset),@out[1]
598 movups @out[2],-16(@outptr[2],$offset)
599 movdqu (@inptr[2],$offset),@out[2]
601 movups @out[3],-16(@outptr[3],$offset)
602 movdqu (@inptr[3],$offset),@out[3]
# Recover the caller's stack pointer that the prologue stored at 16(%rsp).
609 mov 16(%rsp),%rax # original %rsp
# Step the descriptor pointer past four 40-byte entries and take the
# outer-loop branch when the zero flag is clear.
613 lea `40*4`($inp),$inp
615 jnz .Ldec4x_loop_grande
# Win64 only: reload the saved xmm registers relative to the original
# stack pointer; xmm13-xmm15 are not touched by this code path.
619 $code.=<<___ if ($win64);
620 movaps -0xd8(%rax),%xmm6
621 movaps -0xc8(%rax),%xmm7
622 movaps -0xb8(%rax),%xmm8
623 movaps -0xa8(%rax),%xmm9
624 movaps -0x98(%rax),%xmm10
625 movaps -0x88(%rax),%xmm11
626 movaps -0x78(%rax),%xmm12
627 #movaps -0x68(%rax),%xmm13
628 #movaps -0x58(%rax),%xmm14
629 #movaps -0x48(%rax),%xmm15
645 .cfi_def_cfa_register %rsp
649 .size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
# Register assignment for the 8x (AVX) code path: r8-r15 are the eight
# lane pointers, xmm2-xmm9 the eight cipher states, xmm10-xmm13 four input
# blocks, xmm14 the lane counters and xmm15 the 0-round key.
653 my @ptr=map("%r$_",(8..15));
656 my @out=map("%xmm$_",(2..9));
657 my @inp=map("%xmm$_",(10..13));
658 my ($counters,$zero)=("%xmm14","%xmm15");
661 .type aesni_multi_cbc_encrypt_avx,\@function,3
663 aesni_multi_cbc_encrypt_avx:
665 _avx_cbc_enc_shortcut:
667 .cfi_def_cfa_register %rax
# Win64 only: save xmm registers in the frame; all of them are reloaded in
# the epilogue of this routine.
681 $code.=<<___ if ($win64);
684 movaps %xmm7,0x10(%rsp)
685 movaps %xmm8,0x20(%rsp)
686 movaps %xmm9,0x30(%rsp)
687 movaps %xmm10,0x40(%rsp)
688 movaps %xmm11,0x50(%rsp)
689 movaps %xmm12,-0x78(%rax)
690 movaps %xmm13,-0x68(%rax)
691 movaps %xmm14,-0x58(%rax)
692 movaps %xmm15,-0x48(%rax)
698 # +16 input sink [original %rsp and $num]
700 # +64 distances between inputs and outputs
701 # +128 off-load area for @inp[0..3]
# Keep the caller's stack pointer at 16(%rsp); the epilogue and the Win64
# exception handler both read it back from there.
705 mov %rax,16(%rsp) # original %rsp
706 .cfi_cfa_expression %rsp+16,deref,+8
710 vmovdqu ($key),$zero # 0-round key
711 lea 0x78($key),$key # size optimization
716 #mov $num,24(%rsp) # original $num
# Per-lane setup, unrolled eight times at generation time. Only the input
# pointer of each lane stays in a register; the output pointer is turned
# into its distance from the input pointer and parked at 64+8*i on the
# stack. Lane 0 uses the offset register as scratch, the other lanes use
# the offload register.
719 for($i=0;$i<8;$i++) {
720 my $temp = $i ? $offload : $offset;
722 mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
723 mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
725 mov `40*$i+8-40*4`($inp),$temp # output pointer
726 cmovg $one,$num # find maximum
728 vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
729 mov $one,`32+4*$i`(%rsp) # initialize counters
730 cmovle %rsp,@ptr[$i] # cancel input
731 sub @ptr[$i],$temp # distance between input and output
732 mov $temp,`64+8*$i`(%rsp) # initialize distances
# Preload the first two round keys and the value at offset 0xf0 of the
# key schedule (kept in the rounds register).
739 vmovups 0x10-0x78($key),$rndkey1
740 vmovups 0x20-0x78($key),$rndkey0
741 mov 0xf0-0x78($key),$rounds
# Form the initial state of every lane: first input block xor 0-round key
# xor IV. The four input registers are used twice, for lanes 0-3 and then
# for lanes 4-7.
743 vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round
744 lea 128(%rsp),$offload # offload area
745 vpxor (@ptr[1]),$zero,@inp[1]
746 vpxor (@ptr[2]),$zero,@inp[2]
747 vpxor (@ptr[3]),$zero,@inp[3]
748 vpxor @inp[0],@out[0],@out[0]
749 vpxor (@ptr[4]),$zero,@inp[0]
750 vpxor @inp[1],@out[1],@out[1]
751 vpxor (@ptr[5]),$zero,@inp[1]
752 vpxor @inp[2],@out[2],@out[2]
753 vpxor (@ptr[6]),$zero,@inp[2]
754 vpxor @inp[3],@out[3],@out[3]
755 vpxor (@ptr[7]),$zero,@inp[3]
756 vpxor @inp[0],@out[4],@out[4]
757 mov \$1,$one # constant of 1
758 vpxor @inp[1],@out[5],@out[5]
759 vpxor @inp[2],@out[6],@out[6]
760 vpxor @inp[3],@out[7],@out[7]
# Eight rounds, one generated per lane index i. Besides running the round
# on all eight states, iteration i services lane i: it compares the
# constant one with the lane's counter slot, redirects an exhausted lane
# to the stack, loads the lane's next input block already xored with the
# 0-round key, stores the updated distance back at 64+8*i and switches the
# lane pointer over to the output side. The round key register alternates
# with the parity of i. For the first four lanes the freshly loaded input
# is parked in the off-load area; that conditional heredoc statement is
# terminated with a semicolon like its twin in the decrypt routine, so it
# no longer depends on a closing brace following it directly.
766 for($i=0;$i<8;$i++) {
767 my $rndkey=($i&1)?$rndkey0:$rndkey1;
769 vaesenc $rndkey,@out[0],@out[0]
770 cmp 32+4*$i(%rsp),$one
772 $code.=<<___ if ($i);
773 mov 64+8*$i(%rsp),$offset
776 vaesenc $rndkey,@out[1],@out[1]
777 prefetcht0 31(@ptr[$i]) # prefetch input
778 vaesenc $rndkey,@out[2],@out[2]
780 $code.=<<___ if ($i>1);
781 prefetcht0 15(@ptr[$i-2]) # prefetch output
784 vaesenc $rndkey,@out[3],@out[3]
785 lea (@ptr[$i],$offset),$offset
786 cmovge %rsp,@ptr[$i] # cancel input
787 vaesenc $rndkey,@out[4],@out[4]
788 cmovg %rsp,$offset # sink output
789 vaesenc $rndkey,@out[5],@out[5]
791 vaesenc $rndkey,@out[6],@out[6]
792 vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round
793 mov $offset,64+8*$i(%rsp)
794 vaesenc $rndkey,@out[7],@out[7]
795 vmovups `16*(3+$i)-0x78`($key),$rndkey
796 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
798 $code.=<<___ if ($i<4);
799 vmovdqu @inp[$i%4],`16*$i`($offload) # off-load
803 vmovdqu 32(%rsp),$counters
804 prefetcht0 15(@ptr[$i-2]) # prefetch output
805 prefetcht0 15(@ptr[$i-1])
# Further rounds on all eight lanes, alternating between the two round-key
# registers; after each group the register just used is reloaded with the
# key two slots ahead (0xb0, 0xc0, 0xd0, 0xe0).
809 vaesenc $rndkey1,@out[0],@out[0]
810 vaesenc $rndkey1,@out[1],@out[1]
811 vaesenc $rndkey1,@out[2],@out[2]
812 vaesenc $rndkey1,@out[3],@out[3]
813 vaesenc $rndkey1,@out[4],@out[4]
814 vaesenc $rndkey1,@out[5],@out[5]
815 vaesenc $rndkey1,@out[6],@out[6]
816 vaesenc $rndkey1,@out[7],@out[7]
817 vmovups 0xb0-0x78($key),$rndkey1
819 vaesenc $rndkey0,@out[0],@out[0]
820 vaesenc $rndkey0,@out[1],@out[1]
821 vaesenc $rndkey0,@out[2],@out[2]
822 vaesenc $rndkey0,@out[3],@out[3]
823 vaesenc $rndkey0,@out[4],@out[4]
824 vaesenc $rndkey0,@out[5],@out[5]
825 vaesenc $rndkey0,@out[6],@out[6]
826 vaesenc $rndkey0,@out[7],@out[7]
827 vmovups 0xc0-0x78($key),$rndkey0
830 vaesenc $rndkey1,@out[0],@out[0]
831 vaesenc $rndkey1,@out[1],@out[1]
832 vaesenc $rndkey1,@out[2],@out[2]
833 vaesenc $rndkey1,@out[3],@out[3]
834 vaesenc $rndkey1,@out[4],@out[4]
835 vaesenc $rndkey1,@out[5],@out[5]
836 vaesenc $rndkey1,@out[6],@out[6]
837 vaesenc $rndkey1,@out[7],@out[7]
838 vmovups 0xd0-0x78($key),$rndkey1
840 vaesenc $rndkey0,@out[0],@out[0]
841 vaesenc $rndkey0,@out[1],@out[1]
842 vaesenc $rndkey0,@out[2],@out[2]
843 vaesenc $rndkey0,@out[3],@out[3]
844 vaesenc $rndkey0,@out[4],@out[4]
845 vaesenc $rndkey0,@out[5],@out[5]
846 vaesenc $rndkey0,@out[6],@out[6]
847 vaesenc $rndkey0,@out[7],@out[7]
848 vmovups 0xe0-0x78($key),$rndkey0
# Closing rounds. The zero register is temporarily reused to build a
# compare mask for the lane counters: the first four counters (32(%rsp))
# are updated during the next-to-last round, the other four (48(%rsp))
# during the last round; afterwards the 0-round key is reloaded into it.
851 vaesenc $rndkey1,@out[0],@out[0]
852 vpxor $zero,$zero,$zero
853 vaesenc $rndkey1,@out[1],@out[1]
854 vaesenc $rndkey1,@out[2],@out[2]
855 vpcmpgtd $zero,$counters,$zero
856 vaesenc $rndkey1,@out[3],@out[3]
857 vaesenc $rndkey1,@out[4],@out[4]
858 vpaddd $counters,$zero,$zero # decrement counters
859 vmovdqu 48(%rsp),$counters
860 vaesenc $rndkey1,@out[5],@out[5]
861 mov 64(%rsp),$offset # pre-load 1st offset
862 vaesenc $rndkey1,@out[6],@out[6]
863 vaesenc $rndkey1,@out[7],@out[7]
864 vmovups 0x10-0x78($key),$rndkey1
866 vaesenclast $rndkey0,@out[0],@out[0]
867 vmovdqa $zero,32(%rsp) # update counters
868 vpxor $zero,$zero,$zero
869 vaesenclast $rndkey0,@out[1],@out[1]
870 vaesenclast $rndkey0,@out[2],@out[2]
871 vpcmpgtd $zero,$counters,$zero
872 vaesenclast $rndkey0,@out[3],@out[3]
873 vaesenclast $rndkey0,@out[4],@out[4]
874 vpaddd $zero,$counters,$counters # decrement counters
875 vmovdqu -0x78($key),$zero # 0-round
876 vaesenclast $rndkey0,@out[5],@out[5]
877 vaesenclast $rndkey0,@out[6],@out[6]
878 vmovdqa $counters,48(%rsp) # update counters
879 vaesenclast $rndkey0,@out[7],@out[7]
880 vmovups 0x20-0x78($key),$rndkey0
# Per lane: store the result block, subtract the stored distance to turn
# the lane pointer back into an input pointer, and xor the already
# prepared next input into the state. Lanes 0-3 take that input from the
# off-load area, lanes 4-7 from the input registers.
882 vmovups @out[0],-16(@ptr[0]) # write output
883 sub $offset,@ptr[0] # switch to input
884 vpxor 0x00($offload),@out[0],@out[0]
885 vmovups @out[1],-16(@ptr[1])
886 sub `64+1*8`(%rsp),@ptr[1]
887 vpxor 0x10($offload),@out[1],@out[1]
888 vmovups @out[2],-16(@ptr[2])
889 sub `64+2*8`(%rsp),@ptr[2]
890 vpxor 0x20($offload),@out[2],@out[2]
891 vmovups @out[3],-16(@ptr[3])
892 sub `64+3*8`(%rsp),@ptr[3]
893 vpxor 0x30($offload),@out[3],@out[3]
894 vmovups @out[4],-16(@ptr[4])
895 sub `64+4*8`(%rsp),@ptr[4]
896 vpxor @inp[0],@out[4],@out[4]
897 vmovups @out[5],-16(@ptr[5])
898 sub `64+5*8`(%rsp),@ptr[5]
899 vpxor @inp[1],@out[5],@out[5]
900 vmovups @out[6],-16(@ptr[6])
901 sub `64+6*8`(%rsp),@ptr[6]
902 vpxor @inp[2],@out[6],@out[6]
903 vmovups @out[7],-16(@ptr[7])
904 sub `64+7*8`(%rsp),@ptr[7]
905 vpxor @inp[3],@out[7],@out[7]
# Recover the caller's stack pointer that the prologue stored at 16(%rsp).
# The outer-loop step below is commented out in this routine.
910 mov 16(%rsp),%rax # original %rsp
913 #lea `40*8`($inp),$inp
915 #jnz .Lenc8x_loop_grande
# Win64 only: reload all saved xmm registers relative to the original
# stack pointer.
920 $code.=<<___ if ($win64);
921 movaps -0xd8(%rax),%xmm6
922 movaps -0xc8(%rax),%xmm7
923 movaps -0xb8(%rax),%xmm8
924 movaps -0xa8(%rax),%xmm9
925 movaps -0x98(%rax),%xmm10
926 movaps -0x88(%rax),%xmm11
927 movaps -0x78(%rax),%xmm12
928 movaps -0x68(%rax),%xmm13
929 movaps -0x58(%rax),%xmm14
930 movaps -0x48(%rax),%xmm15
946 .cfi_def_cfa_register %rsp
950 .size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
# 8x interleaved CBC decryption (AVX code path); the shortcut label is the
# branch target used by the 4x decrypt entry point.
952 .type aesni_multi_cbc_decrypt_avx,\@function,3
954 aesni_multi_cbc_decrypt_avx:
956 _avx_cbc_dec_shortcut:
958 .cfi_def_cfa_register %rax
# Win64 only: save xmm registers in the frame; all of them are reloaded in
# the epilogue of this routine.
972 $code.=<<___ if ($win64);
975 movaps %xmm7,0x10(%rsp)
976 movaps %xmm8,0x20(%rsp)
977 movaps %xmm9,0x30(%rsp)
978 movaps %xmm10,0x40(%rsp)
979 movaps %xmm11,0x50(%rsp)
980 movaps %xmm12,-0x78(%rax)
981 movaps %xmm13,-0x68(%rax)
982 movaps %xmm14,-0x58(%rax)
983 movaps %xmm15,-0x48(%rax)
989 # +16 input sink [original %rsp and $num]
991 # +64 distances between inputs and outputs
992 # +128 off-load area for @inp[0..3]
993 # +192 IV/input offload
# Keep the caller's stack pointer at 16(%rsp); the epilogue and the Win64
# exception handler both read it back from there.
998 mov %rax,16(%rsp) # original %rsp
999 .cfi_cfa_expression %rsp+16,deref,+8
1003 vmovdqu ($key),$zero # 0-round key
1004 lea 0x78($key),$key # size optimization
1008 .Ldec8x_loop_grande:
1009 #mov $num,24(%rsp) # original $num
# Per-lane setup, unrolled eight times at generation time. Same scheme as
# the 8x encrypt routine, with one addition: the IV of each lane is also
# written to the stack at 192+16*i.
1012 for($i=0;$i<8;$i++) {
1013 my $temp = $i ? $offload : $offset;
1015 mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
1016 mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
1018 mov `40*$i+8-40*4`($inp),$temp # output pointer
1019 cmovg $one,$num # find maximum
1021 vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
1022 mov $one,`32+4*$i`(%rsp) # initialize counters
1023 cmovle %rsp,@ptr[$i] # cancel input
1024 sub @ptr[$i],$temp # distance between input and output
1025 mov $temp,`64+8*$i`(%rsp) # initialize distances
1026 vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV
# Preload the first two round keys and the value at offset 0xf0 of the
# key schedule, then point the offload register at 192+128 on the stack.
1033 vmovups 0x10-0x78($key),$rndkey1
1034 vmovups 0x20-0x78($key),$rndkey0
1035 mov 0xf0-0x78($key),$rounds
1036 lea 192+128(%rsp),$offload # offload area
1038 vmovdqu (@ptr[0]),@out[0] # load inputs
1039 vmovdqu (@ptr[1]),@out[1]
1040 vmovdqu (@ptr[2]),@out[2]
1041 vmovdqu (@ptr[3]),@out[3]
1042 vmovdqu (@ptr[4]),@out[4]
1043 vmovdqu (@ptr[5]),@out[5]
1044 vmovdqu (@ptr[6]),@out[6]
1045 vmovdqu (@ptr[7]),@out[7]
# Keep a copy of every raw input block in the offload area before the
# state register is xored with the 0-round key.
1046 vmovdqu @out[0],0x00($offload) # offload inputs
1047 vpxor $zero,@out[0],@out[0] # xor inputs with 0-round
1048 vmovdqu @out[1],0x10($offload)
1049 vpxor $zero,@out[1],@out[1]
1050 vmovdqu @out[2],0x20($offload)
1051 vpxor $zero,@out[2],@out[2]
1052 vmovdqu @out[3],0x30($offload)
1053 vpxor $zero,@out[3],@out[3]
1054 vmovdqu @out[4],0x40($offload)
1055 vpxor $zero,@out[4],@out[4]
1056 vmovdqu @out[5],0x50($offload)
1057 vpxor $zero,@out[5],@out[5]
1058 vmovdqu @out[6],0x60($offload)
1059 vpxor $zero,@out[6],@out[6]
1060 vmovdqu @out[7],0x70($offload)
1061 vpxor $zero,@out[7],@out[7]
1063 mov \$1,$one # constant of 1
# Eight rounds, one generated per lane index i. Besides running the round
# on all eight states, iteration i services lane i: it compares the
# constant one with the lane's counter slot, redirects an exhausted lane
# to the stack, loads the lane's next input block, stores the updated
# distance back at 64+8*i and switches the lane pointer over to the output
# side. For the first four lanes the loaded input is parked at 128+16*i on
# the stack. The round key register alternates with the parity of i.
1069 for($i=0;$i<8;$i++) {
1070 my $rndkey=($i&1)?$rndkey0:$rndkey1;
1072 vaesdec $rndkey,@out[0],@out[0]
1073 cmp 32+4*$i(%rsp),$one
1075 $code.=<<___ if ($i);
1076 mov 64+8*$i(%rsp),$offset
1079 vaesdec $rndkey,@out[1],@out[1]
1080 prefetcht0 31(@ptr[$i]) # prefetch input
1081 vaesdec $rndkey,@out[2],@out[2]
1083 $code.=<<___ if ($i>1);
1084 prefetcht0 15(@ptr[$i-2]) # prefetch output
1087 vaesdec $rndkey,@out[3],@out[3]
1088 lea (@ptr[$i],$offset),$offset
1089 cmovge %rsp,@ptr[$i] # cancel input
1090 vaesdec $rndkey,@out[4],@out[4]
1091 cmovg %rsp,$offset # sink output
1092 vaesdec $rndkey,@out[5],@out[5]
1093 sub @ptr[$i],$offset
1094 vaesdec $rndkey,@out[6],@out[6]
1095 vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input
1096 mov $offset,64+8*$i(%rsp)
1097 vaesdec $rndkey,@out[7],@out[7]
1098 vmovups `16*(3+$i)-0x78`($key),$rndkey
1099 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
1101 $code.=<<___ if ($i<4);
1102 vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load
1106 vmovdqu 32(%rsp),$counters
1107 prefetcht0 15(@ptr[$i-2]) # prefetch output
1108 prefetcht0 15(@ptr[$i-1])
# Further rounds on all eight lanes, alternating between the two round-key
# registers; after each group the register just used is reloaded with the
# key two slots ahead (0xb0, 0xc0, 0xd0, 0xe0).
1112 vaesdec $rndkey1,@out[0],@out[0]
1113 vaesdec $rndkey1,@out[1],@out[1]
1114 vaesdec $rndkey1,@out[2],@out[2]
1115 vaesdec $rndkey1,@out[3],@out[3]
1116 vaesdec $rndkey1,@out[4],@out[4]
1117 vaesdec $rndkey1,@out[5],@out[5]
1118 vaesdec $rndkey1,@out[6],@out[6]
1119 vaesdec $rndkey1,@out[7],@out[7]
1120 vmovups 0xb0-0x78($key),$rndkey1
1122 vaesdec $rndkey0,@out[0],@out[0]
1123 vaesdec $rndkey0,@out[1],@out[1]
1124 vaesdec $rndkey0,@out[2],@out[2]
1125 vaesdec $rndkey0,@out[3],@out[3]
1126 vaesdec $rndkey0,@out[4],@out[4]
1127 vaesdec $rndkey0,@out[5],@out[5]
1128 vaesdec $rndkey0,@out[6],@out[6]
1129 vaesdec $rndkey0,@out[7],@out[7]
1130 vmovups 0xc0-0x78($key),$rndkey0
1133 vaesdec $rndkey1,@out[0],@out[0]
1134 vaesdec $rndkey1,@out[1],@out[1]
1135 vaesdec $rndkey1,@out[2],@out[2]
1136 vaesdec $rndkey1,@out[3],@out[3]
1137 vaesdec $rndkey1,@out[4],@out[4]
1138 vaesdec $rndkey1,@out[5],@out[5]
1139 vaesdec $rndkey1,@out[6],@out[6]
1140 vaesdec $rndkey1,@out[7],@out[7]
1141 vmovups 0xd0-0x78($key),$rndkey1
1143 vaesdec $rndkey0,@out[0],@out[0]
1144 vaesdec $rndkey0,@out[1],@out[1]
1145 vaesdec $rndkey0,@out[2],@out[2]
1146 vaesdec $rndkey0,@out[3],@out[3]
1147 vaesdec $rndkey0,@out[4],@out[4]
1148 vaesdec $rndkey0,@out[5],@out[5]
1149 vaesdec $rndkey0,@out[6],@out[6]
1150 vaesdec $rndkey0,@out[7],@out[7]
1151 vmovups 0xe0-0x78($key),$rndkey0
# Closing rounds. The zero register is temporarily reused to build a
# compare mask for the lane counters (32(%rsp) first, then 48(%rsp)), and
# after its last round each state is xored with the block kept in the
# offload area to complete the CBC step.
1154 vaesdec $rndkey1,@out[0],@out[0]
1155 vpxor $zero,$zero,$zero
1156 vaesdec $rndkey1,@out[1],@out[1]
1157 vaesdec $rndkey1,@out[2],@out[2]
1158 vpcmpgtd $zero,$counters,$zero
1159 vaesdec $rndkey1,@out[3],@out[3]
1160 vaesdec $rndkey1,@out[4],@out[4]
1161 vpaddd $counters,$zero,$zero # decrement counters
1162 vmovdqu 48(%rsp),$counters
1163 vaesdec $rndkey1,@out[5],@out[5]
1164 mov 64(%rsp),$offset # pre-load 1st offset
1165 vaesdec $rndkey1,@out[6],@out[6]
1166 vaesdec $rndkey1,@out[7],@out[7]
1167 vmovups 0x10-0x78($key),$rndkey1
1169 vaesdeclast $rndkey0,@out[0],@out[0]
1170 vmovdqa $zero,32(%rsp) # update counters
1171 vpxor $zero,$zero,$zero
1172 vaesdeclast $rndkey0,@out[1],@out[1]
1173 vpxor 0x00($offload),@out[0],@out[0] # xor with IV
1174 vaesdeclast $rndkey0,@out[2],@out[2]
1175 vpxor 0x10($offload),@out[1],@out[1]
1176 vpcmpgtd $zero,$counters,$zero
1177 vaesdeclast $rndkey0,@out[3],@out[3]
1178 vpxor 0x20($offload),@out[2],@out[2]
1179 vaesdeclast $rndkey0,@out[4],@out[4]
1180 vpxor 0x30($offload),@out[3],@out[3]
1181 vpaddd $zero,$counters,$counters # decrement counters
1182 vmovdqu -0x78($key),$zero # 0-round
1183 vaesdeclast $rndkey0,@out[5],@out[5]
1184 vpxor 0x40($offload),@out[4],@out[4]
1185 vaesdeclast $rndkey0,@out[6],@out[6]
1186 vpxor 0x50($offload),@out[5],@out[5]
1187 vmovdqa $counters,48(%rsp) # update counters
1188 vaesdeclast $rndkey0,@out[7],@out[7]
1189 vpxor 0x60($offload),@out[6],@out[6]
1190 vmovups 0x20-0x78($key),$rndkey0
# Per lane: store the result block, subtract the stored distance to turn
# the lane pointer back into an input pointer, then set up the next block.
# The next raw input (from 128+16*i on the stack for lanes 0-3, from the
# input registers for lanes 4-7) is saved in the offload area and its xor
# with the 0-round key becomes the new state.
1192 vmovups @out[0],-16(@ptr[0]) # write output
1193 sub $offset,@ptr[0] # switch to input
1194 vmovdqu 128+0(%rsp),@out[0]
1195 vpxor 0x70($offload),@out[7],@out[7]
1196 vmovups @out[1],-16(@ptr[1])
1197 sub `64+1*8`(%rsp),@ptr[1]
1198 vmovdqu @out[0],0x00($offload)
1199 vpxor $zero,@out[0],@out[0]
1200 vmovdqu 128+16(%rsp),@out[1]
1201 vmovups @out[2],-16(@ptr[2])
1202 sub `64+2*8`(%rsp),@ptr[2]
1203 vmovdqu @out[1],0x10($offload)
1204 vpxor $zero,@out[1],@out[1]
1205 vmovdqu 128+32(%rsp),@out[2]
1206 vmovups @out[3],-16(@ptr[3])
1207 sub `64+3*8`(%rsp),@ptr[3]
1208 vmovdqu @out[2],0x20($offload)
1209 vpxor $zero,@out[2],@out[2]
1210 vmovdqu 128+48(%rsp),@out[3]
1211 vmovups @out[4],-16(@ptr[4])
1212 sub `64+4*8`(%rsp),@ptr[4]
1213 vmovdqu @out[3],0x30($offload)
1214 vpxor $zero,@out[3],@out[3]
1215 vmovdqu @inp[0],0x40($offload)
1216 vpxor @inp[0],$zero,@out[4]
1217 vmovups @out[5],-16(@ptr[5])
1218 sub `64+5*8`(%rsp),@ptr[5]
1219 vmovdqu @inp[1],0x50($offload)
1220 vpxor @inp[1],$zero,@out[5]
1221 vmovups @out[6],-16(@ptr[6])
1222 sub `64+6*8`(%rsp),@ptr[6]
1223 vmovdqu @inp[2],0x60($offload)
1224 vpxor @inp[2],$zero,@out[6]
1225 vmovups @out[7],-16(@ptr[7])
1226 sub `64+7*8`(%rsp),@ptr[7]
1227 vmovdqu @inp[3],0x70($offload)
1228 vpxor @inp[3],$zero,@out[7]
# Recover the caller's stack pointer that the prologue stored at 16(%rsp).
# The outer-loop step below is commented out in this routine.
1234 mov 16(%rsp),%rax # original %rsp
1237 #lea `40*8`($inp),$inp
1239 #jnz .Ldec8x_loop_grande
# Win64 only: reload all saved xmm registers relative to the original
# stack pointer.
1244 $code.=<<___ if ($win64);
1245 movaps -0xd8(%rax),%xmm6
1246 movaps -0xc8(%rax),%xmm7
1247 movaps -0xb8(%rax),%xmm8
1248 movaps -0xa8(%rax),%xmm9
1249 movaps -0x98(%rax),%xmm10
1250 movaps -0x88(%rax),%xmm11
1251 movaps -0x78(%rax),%xmm12
1252 movaps -0x68(%rax),%xmm13
1253 movaps -0x58(%rax),%xmm14
1254 movaps -0x48(%rax),%xmm15
1270 .cfi_def_cfa_register %rsp
1274 .size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
# Win64 structured-exception handler referenced by the unwind tables below.
# It compares the faulting Rip with the two labels given in HandlerData,
# reads the saved stack pointer from offset 16 of the frame, writes
# integer registers and the xmm save area back into the CONTEXT record,
# copies that record to disp->ContextRecord, calls RtlVirtualUnwind and
# returns ExceptionContinueSearch.
1279 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1280 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1287 .extern __imp_RtlVirtualUnwind
1288 .type se_handler,\@abi-omnipotent
1302 mov 120($context),%rax # pull context->Rax
1303 mov 248($context),%rbx # pull context->Rip
1305 mov 8($disp),%rsi # disp->ImageBase
1306 mov 56($disp),%r11 # disp->HandlerData
1308 mov 0(%r11),%r10d # HandlerData[0]
1309 lea (%rsi,%r10),%r10 # prologue label
1310 cmp %r10,%rbx # context->Rip<.Lprologue
1313 mov 152($context),%rax # pull context->Rsp
1315 mov 4(%r11),%r10d # HandlerData[1]
1316 lea (%rsi,%r10),%r10 # epilogue label
1317 cmp %r10,%rbx # context->Rip>=.Lepilogue
1320 mov 16(%rax),%rax # pull saved stack pointer
1328 mov %rbx,144($context) # restore context->Rbx
1329 mov %rbp,160($context) # restore context->Rbp
1330 mov %r12,216($context) # restore context->R12
1331 mov %r13,224($context) # restore context->R13
1332 mov %r14,232($context) # restore context->R14
1333 mov %r15,240($context) # restore context->R15
# Source of the xmm copy: 56+10*16 bytes below the recovered stack pointer.
1335 lea -56-10*16(%rax),%rsi
1336 lea 512($context),%rdi # &context.Xmm6
1338 .long 0xa548f3fc # cld; rep movsq
1343 mov %rax,152($context) # restore context->Rsp
1344 mov %rsi,168($context) # restore context->Rsi
1345 mov %rdi,176($context) # restore context->Rdi
# Copy the updated CONTEXT (154 quadwords) into disp->ContextRecord.
1347 mov 40($disp),%rdi # disp->ContextRecord
1348 mov $context,%rsi # context
1349 mov \$154,%ecx # sizeof(CONTEXT)
1350 .long 0xa548f3fc # cld; rep movsq
# Set up the eight arguments of RtlVirtualUnwind: four in registers, four
# on the stack starting at 32(%rsp).
1353 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1354 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1355 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1356 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1357 mov 40(%rsi),%r10 # disp->ContextRecord
1358 lea 56(%rsi),%r11 # &disp->HandlerData
1359 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1360 mov %r10,32(%rsp) # arg5
1361 mov %r11,40(%rsp) # arg6
1362 mov %r12,48(%rsp) # arg7
1363 mov %rcx,56(%rsp) # arg8, (NULL)
1364 call *__imp_RtlVirtualUnwind(%rip)
1366 mov \$1,%eax # ExceptionContinueSearch
1378 .size se_handler,.-se_handler
# Function table: one begin/end/info triple of image-relative addresses
# per routine. The two AVX entries are emitted only when AVX code is
# generated.
1382 .rva .LSEH_begin_aesni_multi_cbc_encrypt
1383 .rva .LSEH_end_aesni_multi_cbc_encrypt
1384 .rva .LSEH_info_aesni_multi_cbc_encrypt
1385 .rva .LSEH_begin_aesni_multi_cbc_decrypt
1386 .rva .LSEH_end_aesni_multi_cbc_decrypt
1387 .rva .LSEH_info_aesni_multi_cbc_decrypt
1389 $code.=<<___ if ($avx);
1390 .rva .LSEH_begin_aesni_multi_cbc_encrypt_avx
1391 .rva .LSEH_end_aesni_multi_cbc_encrypt_avx
1392 .rva .LSEH_info_aesni_multi_cbc_encrypt_avx
1393 .rva .LSEH_begin_aesni_multi_cbc_decrypt_avx
1394 .rva .LSEH_end_aesni_multi_cbc_decrypt_avx
1395 .rva .LSEH_info_aesni_multi_cbc_decrypt_avx
# Per-routine info records. HandlerData carries the body and epilogue
# labels that the exception handler compares the faulting Rip against.
1400 .LSEH_info_aesni_multi_cbc_encrypt:
1403 .rva .Lenc4x_body,.Lenc4x_epilogue # HandlerData[]
1404 .LSEH_info_aesni_multi_cbc_decrypt:
1407 .rva .Ldec4x_body,.Ldec4x_epilogue # HandlerData[]
1409 $code.=<<___ if ($avx);
1410 .LSEH_info_aesni_multi_cbc_encrypt_avx:
1413 .rva .Lenc8x_body,.Lenc8x_epilogue # HandlerData[]
1414 .LSEH_info_aesni_multi_cbc_decrypt_avx:
1417 .rva .Ldec8x_body,.Ldec8x_epilogue # HandlerData[]
1420 ####################################################################
# Helpers that turn AES-NI mnemonics into raw .byte sequences.
#
# REX helper: receives a reference to the opcode list and appends a REX
# prefix byte (0x40 plus the collected bits) when either register number
# is 8 or above.
1423 local *opcode=shift;
1427 $rex|=0x04 if($dst>=8);
1428 $rex|=0x01 if($src>=8);
1429 push @opcode,$rex|0x40 if($rex);
# Encoder, form 1: aeskeygenassist with an immediate and two xmm registers.
# Emits the 0x0f,0x3a,0xdf opcode bytes, a register-register ModR/M byte
# and the immediate (oct() is applied when it starts with 0).
1436 if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1437 rex(\@opcode,$4,$3);
1438 push @opcode,0x0f,0x3a,0xdf;
1439 push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
1441 push @opcode,$c=~/^0/?oct($c):$c;
1442 return ".byte\t".join(',',@opcode);
# Encoder, form 2: register-to-register round instructions. Mnemonics
# missing from the opcode table make the encoder return undef.
1444 elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1447 "aesenc" => 0xdc, "aesenclast" => 0xdd,
1448 "aesdec" => 0xde, "aesdeclast" => 0xdf
1450 return undef if (!defined($opcodelet{$1}));
1451 rex(\@opcode,$3,$2);
1452 push @opcode,0x0f,0x38,$opcodelet{$1};
1453 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
1454 return ".byte\t".join(',',@opcode);
# Encoder, form 3: round instructions whose source is a stack slot at a
# displacement from rsp; the displacement is truncated to a single byte.
1456 elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
1458 "aesenc" => 0xdc, "aesenclast" => 0xdd,
1459 "aesdec" => 0xde, "aesdeclast" => 0xdf
1461 return undef if (!defined($opcodelet{$1}));
1463 push @opcode,0x44 if ($3>=8);
1464 push @opcode,0x0f,0x38,$opcodelet{$1};
1465 push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
1466 push @opcode,($off=~/^0/?oct($off):$off)&0xff;
1467 return ".byte\t".join(',',@opcode);
# Post-processing of the generated text: evaluate every backtick-quoted
# expression, then pass each line that names an AES instruction with an
# xmm operand through the encoder above.
1472 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
1473 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
# A failing close is fatal so that output errors are not silently lost.
1476 close STDOUT or die "error closing STDOUT: $!";