3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # This module implements support for Intel AES-NI extension. In
11 # OpenSSL context it's used with Intel engine, but can also be used as
12 # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
16 # - Win64 SEH handlers;
18 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
19 # generates drop-in replacement for
20 # crypto/aes/asm/aes-x86_64.pl:-)
24 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
26 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
28 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
29 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
30 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
31 die "can't locate x86_64-xlate.pl";
33 open STDOUT,"| $^X $xlate $flavour $output";
35 $movkey = $PREFIX eq "aesni" ? "movaps" : "movups";
39 $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
41 # this is natural argument order for public $PREFIX_*crypt...
44 # ... and for $PREFIX_[ecb|cbc]_encrypt in particular.
46 $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
49 $rnds_="%r10d"; # backup copy for $rounds
50 $key_="%r11"; # backup copy for $key
52 # %xmm register layout
53 $inout0="%xmm0"; $inout1="%xmm1";
54 $inout2="%xmm2"; $inout3="%xmm3";
55 $inout4="%xmm4"; $inout5="%xmm5";
56 $rndkey0="%xmm6"; $rndkey1="%xmm7";
59 $in0="%xmm9"; $in1="%xmm10";
60 $in2="%xmm11"; $in3="%xmm12";
61 $in4="%xmm13"; $in5="%xmm14";
63 # Inline version of internal aesni_[en|de]crypt1.
65 # Why folded loop? Because aes[enc|dec] is slow enough to accommodate
66 # cycles which take care of loop variables...
69 my ($data,$rndkey0,$rndkey1,$key,$rounds)=@_;
72 $movkey ($key),$rndkey0
73 $movkey 16($key),$rndkey1
83 $movkey ($key),$rndkey1
84 jnz .Loop_enc1_$sn # loop body is 16 bytes
86 aesenclast $rndkey1,$data
91 my ($data,$rndkey0,$rndkey1,$key,$rounds)=@_;
94 $movkey ($key),$rndkey0
95 $movkey 16($key),$rndkey1
102 aesdec $rndkey1,$data
105 $movkey ($key),$rndkey1
106 jnz .Loop_dec1_$sn # loop body is 16 bytes
108 aesdeclast $rndkey1,$data
112 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
115 .globl ${PREFIX}_encrypt
116 .type ${PREFIX}_encrypt,\@function,3
119 movups ($inp),%xmm0 # load input
120 mov 240(%rdx),$rounds # pull $rounds
122 &aesni_encrypt1("%xmm0","%xmm1","%xmm2","%rdx",$rounds);
124 movups %xmm0,(%rsi) # output
126 .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
129 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
132 .globl ${PREFIX}_decrypt
133 .type ${PREFIX}_decrypt,\@function,3
136 movups ($inp),%xmm0 # load input
137 mov 240(%rdx),$rounds # pull $rounds
139 &aesni_decrypt1("%xmm0","%xmm1","%xmm2","%rdx",$rounds);
141 movups %xmm0,($out) # output
143 .size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
146 # _aesni_[en|de]crypt6 are private interfaces, 6 denotes interleave
147 # factor. Why 6x? Because aes[enc|dec] latency is 6 and 6x interleave
148 # provides optimal utilization, so that subroutine's throughput is
149 # virtually same for *any* number [naturally up to 6] of input blocks
150 # as for non-interleaved subroutine. This is why it handles even
151 # double-, triple-, quad- and penta-block inputs. Larger interleave
152 # factor, e.g. 8x, would perform suboptimally on these shorter inputs...
153 sub aesni_generate6 {
155 # As already mentioned it takes in $key and $rounds, which are *not*
156 # preserved. $inout[0-5] is cipher/clear text...
158 .type _aesni_${dir}rypt6,\@abi-omnipotent
161 $movkey ($key),$rndkey0
162 $movkey 16($key),$rndkey1
166 pxor $rndkey0,$inout0
167 pxor $rndkey0,$inout1
168 pxor $rndkey0,$inout2
169 pxor $rndkey0,$inout3
170 pxor $rndkey0,$inout4
171 pxor $rndkey0,$inout5
175 aes${dir} $rndkey1,$inout0
176 $movkey ($key),$rndkey0
177 aes${dir} $rndkey1,$inout1
179 aes${dir} $rndkey1,$inout2
180 aes${dir} $rndkey1,$inout3
181 aes${dir} $rndkey1,$inout4
182 aes${dir} $rndkey1,$inout5
183 aes${dir} $rndkey0,$inout0
184 $movkey 16($key),$rndkey1
185 aes${dir} $rndkey0,$inout1
187 aes${dir} $rndkey0,$inout2
188 aes${dir} $rndkey0,$inout3
189 aes${dir} $rndkey0,$inout4
190 aes${dir} $rndkey0,$inout5
192 aes${dir} $rndkey1,$inout0
193 $movkey ($key),$rndkey0
194 aes${dir} $rndkey1,$inout1
195 aes${dir} $rndkey1,$inout2
196 aes${dir} $rndkey1,$inout3
197 aes${dir} $rndkey1,$inout4
198 aes${dir} $rndkey1,$inout5
199 aes${dir}last $rndkey0,$inout0
200 aes${dir}last $rndkey0,$inout1
201 aes${dir}last $rndkey0,$inout2
202 aes${dir}last $rndkey0,$inout3
203 aes${dir}last $rndkey0,$inout4
204 aes${dir}last $rndkey0,$inout5
206 .size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
209 &aesni_generate6("enc");
210 &aesni_generate6("dec");
212 if ($PREFIX eq "aesni") {
213 # void aesni_ecb_encrypt (const void *in, void *out,
214 # size_t length, const AES_KEY *key,
217 .globl aesni_ecb_encrypt
218 .type aesni_ecb_encrypt,\@function,5
221 cmp \$16,$len # check length
224 $code.=<<___ if ($win64);
227 movaps %xmm7,16(%rsp)
230 mov 240($key),$rounds # pull $rounds
232 mov $key,$key_ # backup $key
234 mov $rounds,$rnds_ # backup $rounds
236 #--------------------------- ECB ENCRYPT ------------------------------#
242 movups ($inp),$inout0
243 movups 0x10($inp),$inout1
244 movups 0x20($inp),$inout2
245 movups 0x30($inp),$inout3
246 movups 0x40($inp),$inout4
247 movups 0x50($inp),$inout5
249 movups $inout0,($out)
251 movups $inout1,0x10($out)
253 movups $inout2,0x20($out)
254 mov $rnds_,$rounds # restore $rounds
255 movups $inout3,0x30($out)
256 mov $key_,$key # restore $key
257 movups $inout4,0x40($out)
258 movups $inout5,0x50($out)
267 movups ($inp),$inout0
270 movups 0x10($inp),$inout1
273 movups 0x20($inp),$inout2
276 movups 0x30($inp),$inout3
278 movups 0x40($inp),$inout4
280 movups $inout0,($out)
281 movups $inout1,0x10($out)
282 movups $inout2,0x20($out)
283 movups $inout3,0x30($out)
284 movups $inout4,0x40($out)
289 &aesni_encrypt1($inout0,$rndkey0,$rndkey1,$key,$rounds);
291 movups $inout0,($out)
296 movups $inout0,($out)
297 movups $inout1,0x10($out)
302 movups $inout0,($out)
303 movups $inout1,0x10($out)
304 movups $inout2,0x20($out)
309 movups $inout0,($out)
310 movups $inout1,0x10($out)
311 movups $inout2,0x20($out)
312 movups $inout3,0x30($out)
314 \f#--------------------------- ECB DECRYPT ------------------------------#
322 movups ($inp),$inout0
323 movups 0x10($inp),$inout1
324 movups 0x20($inp),$inout2
325 movups 0x30($inp),$inout3
326 movups 0x40($inp),$inout4
327 movups 0x50($inp),$inout5
329 movups $inout0,($out)
331 movups $inout1,0x10($out)
333 movups $inout2,0x20($out)
334 mov $rnds_,$rounds # restore $rounds
335 movups $inout3,0x30($out)
336 mov $key_,$key # restore $key
337 movups $inout4,0x40($out)
338 movups $inout5,0x50($out)
347 movups ($inp),$inout0
350 movups 0x10($inp),$inout1
353 movups 0x20($inp),$inout2
356 movups 0x30($inp),$inout3
358 movups 0x40($inp),$inout4
360 movups $inout0,($out)
361 movups $inout1,0x10($out)
362 movups $inout2,0x20($out)
363 movups $inout3,0x30($out)
364 movups $inout4,0x40($out)
369 &aesni_decrypt1($inout0,$rndkey0,$rndkey1,$key,$rounds);
371 movups $inout0,($out)
376 movups $inout0,($out)
377 movups $inout1,0x10($out)
382 movups $inout0,($out)
383 movups $inout1,0x10($out)
384 movups $inout2,0x20($out)
389 movups $inout0,($out)
390 movups $inout1,0x10($out)
391 movups $inout2,0x20($out)
392 movups $inout3,0x30($out)
396 $code.=<<___ if ($win64);
398 movaps 0x10(%rsp),%xmm7
404 .size aesni_ecb_encrypt,.-aesni_ecb_encrypt
408 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
409 # size_t length, const AES_KEY *key,
410 # unsigned char *ivp,const int enc);
411 $reserved = $win64?0x90:-0x18; # used in decrypt
413 .globl ${PREFIX}_cbc_encrypt
414 .type ${PREFIX}_cbc_encrypt,\@function,6
416 ${PREFIX}_cbc_encrypt:
417 test $len,$len # check length
419 mov 240($key),$rounds # pull $rounds
420 mov $key,$key_ # backup $key
422 mov $rounds,$rnds_ # backup $rounds
424 #--------------------------- CBC ENCRYPT ------------------------------#
425 movups ($ivp),%xmm0 # load iv as initial state
432 movups ($inp),%xmm2 # load input
436 &aesni_encrypt1("%xmm0","%xmm1","%xmm2",$key,$rounds);
438 movups %xmm0,($out) # store output
441 mov $rnds_,$rounds # restore $rounds
442 mov $key_,$key # restore $key
450 mov $len,%rcx # zaps $key
451 xchg $inp,$out # $inp is %rsi and $out is %rdi now
452 .long 0x9066A4F3 # rep movsb
453 mov \$16,%ecx # zero tail
456 .long 0x9066AAF3 # rep stosb
457 lea -16(%rdi),%rdi # rewind $out by 1 block
458 mov $rnds_,$rounds # restore $rounds
459 mov %rdi,%rsi # $inp and $out are the same
460 mov $key_,$key # restore $key
461 xor $len,$len # len=16
462 jmp .Lcbc_enc_loop # one more spin
463 \f#--------------------------- CBC DECRYPT ------------------------------#
467 $code.=<<___ if ($win64);
470 movaps %xmm7,0x10(%rsp)
471 movaps %xmm8,0x20(%rsp)
472 movaps %xmm9,0x30(%rsp)
473 movaps %xmm10,0x40(%rsp)
474 movaps %xmm11,0x50(%rsp)
475 movaps %xmm12,0x60(%rsp)
476 movaps %xmm13,0x70(%rsp)
477 movaps %xmm14,0x80(%rsp)
486 movups ($inp),$inout0
487 movups 0x10($inp),$inout1
488 movups 0x20($inp),$inout2
489 movups 0x30($inp),$inout3
491 movups 0x40($inp),$inout4
493 movups 0x50($inp),$inout5
501 movups $inout0,($out)
504 movups $inout1,0x10($out)
507 movups $inout2,0x20($out)
508 mov $rnds_,$rounds # restore $rounds
510 movups $inout3,0x30($out)
511 mov $key_,$key # restore $key
513 movups $inout4,0x40($out)
515 movups $inout5,0x50($out)
524 movups ($inp),$inout0
528 movups 0x10($inp),$inout1
532 movups 0x20($inp),$inout2
536 movups 0x30($inp),$inout3
540 movups 0x40($inp),$inout4
544 movups 0x50($inp),$inout5
549 movups $inout0,($out)
551 movups $inout1,0x10($out)
553 movups $inout2,0x20($out)
555 movups $inout3,0x30($out)
557 movups $inout4,0x40($out)
559 movaps $inout5,$inout0
561 jmp .Lcbc_dec_tail_collected
565 &aesni_decrypt1($inout0,$rndkey0,$rndkey1,$key,$rounds);
569 jmp .Lcbc_dec_tail_collected
575 movups $inout0,($out)
577 movaps $inout1,$inout0
579 jmp .Lcbc_dec_tail_collected
585 movups $inout0,($out)
587 movups $inout1,0x10($out)
589 movaps $inout2,$inout0
591 jmp .Lcbc_dec_tail_collected
597 movups $inout0,($out)
599 movups $inout1,0x10($out)
601 movups $inout2,0x20($out)
603 movaps $inout3,$inout0
605 jmp .Lcbc_dec_tail_collected
611 movups $inout0,($out)
613 movups $inout1,0x10($out)
615 movups $inout2,0x20($out)
617 movups $inout3,0x30($out)
619 movaps $inout4,$inout0
621 jmp .Lcbc_dec_tail_collected
623 .Lcbc_dec_tail_collected:
626 jnz .Lcbc_dec_tail_partial
627 movups $inout0,($out)
629 .Lcbc_dec_tail_partial:
630 movaps $inout0,$reserved(%rsp)
633 lea $reserved(%rsp),%rsi
634 .long 0x9066A4F3 # rep movsb
638 $code.=<<___ if ($win64);
640 movaps 0x10(%rsp),%xmm7
641 movaps 0x20(%rsp),%xmm8
642 movaps 0x30(%rsp),%xmm9
643 movaps 0x40(%rsp),%xmm10
644 movaps 0x50(%rsp),%xmm11
645 movaps 0x60(%rsp),%xmm12
646 movaps 0x70(%rsp),%xmm13
647 movaps 0x80(%rsp),%xmm14
653 .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
657 # this is natural argument order for $PREFIX_set_[en|de]crypt_key
662 # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
665 .globl ${PREFIX}_set_encrypt_key
666 .type ${PREFIX}_set_encrypt_key,\@function,3
668 ${PREFIX}_set_encrypt_key:
669 call _aesni_set_encrypt_key
671 .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
673 # int $PREFIX_set_decrypt_key(const unsigned char *userKey, const int bits,
676 .globl ${PREFIX}_set_decrypt_key
677 .type ${PREFIX}_set_decrypt_key,\@function,3
679 ${PREFIX}_set_decrypt_key:
680 call _aesni_set_encrypt_key
681 shl \$4,%esi # actually rounds after _aesni_set_encrypt_key
684 lea (%rdx,%rsi),%rsi# points at the end of key schedule
686 $movkey (%rdx),%xmm0 # just swap
692 jmp .Ldec_key_inverse
695 $movkey (%rdx),%xmm0 # swap and inverse
702 $movkey %xmm0,16(%rsi)
703 $movkey %xmm1,-16(%rdx)
706 $movkey (%rdx),%xmm0 # inverse middle
711 .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
714 # This is based on submission by
716 # Huang Ying <ying.huang@intel.com>
717 # Vinodh Gopal <vinodh.gopal@intel.com>
720 # Aggressively optimized with respect to aeskeygenassist's critical path
721 # and is contained in %xmm0-5 to meet Win64 ABI requirement.
724 .type _aesni_set_encrypt_key,\@abi-omnipotent
726 _aesni_set_encrypt_key:
732 movups (%rdi),%xmm0 # pull first 128 bits of *userKey
733 pxor %xmm4,%xmm4 # low dword of xmm4 is assumed 0
743 mov \$10,%esi # 10 rounds for 128-bit key
744 $movkey %xmm0,(%rdx) # round 0
745 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
746 call .Lkey_expansion_128_cold
747 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
748 call .Lkey_expansion_128
749 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
750 call .Lkey_expansion_128
751 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
752 call .Lkey_expansion_128
753 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
754 call .Lkey_expansion_128
755 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
756 call .Lkey_expansion_128
757 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
758 call .Lkey_expansion_128
759 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
760 call .Lkey_expansion_128
761 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
762 call .Lkey_expansion_128
763 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
764 call .Lkey_expansion_128
766 mov %esi,80(%rcx) # 240(%rdx)
774 .Lkey_expansion_128_cold:
775 shufps \$0b00010000,%xmm0,%xmm4
777 shufps \$0b10001100,%xmm0,%xmm4
779 pshufd \$0b11111111,%xmm1,%xmm1 # critical path
785 movq 16(%rdi),%xmm2 # remaining 1/3 of *userKey
786 mov \$12,%esi # 12 rounds for 192
787 $movkey %xmm0,(%rdx) # round 0
788 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
789 call .Lkey_expansion_192a_cold
790 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
791 call .Lkey_expansion_192b
792 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
793 call .Lkey_expansion_192a
794 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
795 call .Lkey_expansion_192b
796 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
797 call .Lkey_expansion_192a
798 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
799 call .Lkey_expansion_192b
800 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
801 call .Lkey_expansion_192a
802 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
803 call .Lkey_expansion_192b
805 mov %esi,48(%rcx) # 240(%rdx)
810 .Lkey_expansion_192a:
813 .Lkey_expansion_192a_cold:
815 .Lkey_expansion_192b_warm:
816 shufps \$0b00010000,%xmm0,%xmm4
819 shufps \$0b10001100,%xmm0,%xmm4
822 pshufd \$0b01010101,%xmm1,%xmm1 # critical path
825 pshufd \$0b11111111,%xmm0,%xmm3
830 .Lkey_expansion_192b:
832 shufps \$0b01000100,%xmm0,%xmm5
834 shufps \$0b01001110,%xmm2,%xmm3
835 $movkey %xmm3,16(%rcx)
837 jmp .Lkey_expansion_192b_warm
841 movups 16(%rdi),%xmm2 # remaning half of *userKey
842 mov \$14,%esi # 14 rounds for 256
844 $movkey %xmm0,(%rdx) # round 0
845 $movkey %xmm2,16(%rdx) # round 1
846 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
847 call .Lkey_expansion_256a_cold
848 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
849 call .Lkey_expansion_256b
850 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
851 call .Lkey_expansion_256a
852 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
853 call .Lkey_expansion_256b
854 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
855 call .Lkey_expansion_256a
856 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
857 call .Lkey_expansion_256b
858 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
859 call .Lkey_expansion_256a
860 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
861 call .Lkey_expansion_256b
862 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
863 call .Lkey_expansion_256a
864 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
865 call .Lkey_expansion_256b
866 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
867 call .Lkey_expansion_256a
868 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
869 call .Lkey_expansion_256b
870 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
871 call .Lkey_expansion_256a
873 mov %esi,16(%rcx) # 240(%rdx)
878 .Lkey_expansion_256a:
881 .Lkey_expansion_256a_cold:
882 shufps \$0b00010000,%xmm0,%xmm4
884 shufps \$0b10001100,%xmm0,%xmm4
886 pshufd \$0b11111111,%xmm1,%xmm1 # critical path
891 .Lkey_expansion_256b:
895 shufps \$0b00010000,%xmm2,%xmm4
897 shufps \$0b10001100,%xmm2,%xmm4
899 pshufd \$0b10101010,%xmm1,%xmm1 # critical path
910 .size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key
915 .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
923 if ($dst>=8 || $src>=8) {
925 $rex|=0x04 if($dst>=8);
926 $rex|=0x01 if($src>=8);
935 if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
937 push @opcode,0x0f,0x3a,0xdf;
938 push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
940 push @opcode,$c=~/^0/?oct($c):$c;
941 return ".byte\t".join(',',@opcode);
943 elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
946 "aesenc" => 0xdc, "aesenclast" => 0xdd,
947 "aesdec" => 0xde, "aesdeclast" => 0xdf
949 return undef if (!defined($opcodelet{$1}));
951 push @opcode,0x0f,0x38,$opcodelet{$1};
952 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
953 return ".byte\t".join(',',@opcode);
958 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
959 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;