2 # Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 ###################################################################
11 ### AES-128 [originally in CTR mode] ###
12 ### bitsliced implementation for Intel Core 2 processors ###
13 ### requires support of SSE extensions up to SSSE3 ###
14 ### Author: Emilia Käsper and Peter Schwabe ###
15 ### Date: 2009-03-19 ###
18 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
19 ### further information. ###
20 ###################################################################
24 # Started as transliteration to "perlasm" the original code has
25 # undergone following changes:
27 # - code was made position-independent;
28 # - rounds were folded into a loop resulting in >5x size reduction
29 # from 12.5KB to 2.2KB;
30 # - above was possible thanks to mixcolumns() modification that
31 # allowed to feed its output back to aesenc[last], this was
32 #   achieved at the cost of two additional inter-register moves;
33 # - some instruction reordering and interleaving;
34 # - this module doesn't implement key setup subroutine, instead it
35 # relies on conversion of "conventional" key schedule as returned
36 # by AES_set_encrypt_key (see discussion below);
37 # - first and last round keys are treated differently, which allowed
38 # to skip one shiftrows(), reduce bit-sliced key schedule and
39 # speed-up conversion by 22%;
40 # - support for 192- and 256-bit keys was added;
42 # Resulting performance in CPU cycles spent to encrypt one byte out
43 # of 4096-byte buffer with 128-bit key is:
45 # Emilia's this(*) difference
47 # Core 2 9.30 8.69 +7%
48 # Nehalem(**) 7.63 6.88 +11%
53 # (*) Comparison is not completely fair, because "this" is ECB,
54 # i.e. no extra processing such as counter values calculation
55 # and xor-ing input as in Emilia's CTR implementation is
56 #     performed. However, the CTR calculations account for no more
57 #     than 1% of total time, so the comparison is *rather* fair.
59 # (**) Results were collected on Westmere, which is considered to
60 # be equivalent to Nehalem for this code.
62 # As for the key schedule conversion subroutine: the interface to OpenSSL
63 # relies on per-invocation on-the-fly conversion. This naturally
64 # has impact on performance, especially for short inputs. Conversion
65 # time in CPU cycles and its ratio to CPU cycles spent in 8x block
68 # conversion conversion/8x block
73 # The ratio values mean that 128-byte blocks will be processed
74 # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
75 # etc. Then keep in mind that input sizes not divisible by 128 are
76 # *effectively* slower, especially shortest ones, e.g. consecutive
77 # 144-byte blocks are processed 44% slower than one would expect,
78 # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
79 # it's still faster than ["hyper-threading-safe" code path in]
80 # aes-x86_64.pl on all lengths above 64 bytes...
84 # Add decryption procedure. Performance in CPU cycles spent to decrypt
85 # one byte out of 4096-byte buffer with 128-bit key is:
95 # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
96 # suboptimal, but XTS is meant to be used with larger blocks...
100 # $output is the last argument if it looks like a file (it has an extension)
101 # $flavour is the first argument if it doesn't look like a file
102 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
103 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
105 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
107 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
108 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
109 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
110 die "can't locate x86_64-xlate.pl";
112 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
113 or die "can't call $xlate: $!";
116 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
117 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
118 my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
121 my ($key,$rounds,$const)=("%rax","%r10d","%r11");
124 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
125 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
130 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
131 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
135 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
136 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
158 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
159 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
179 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
180 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
184 &InvInBasisChange (@b);
185 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
186 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
189 sub InvInBasisChange { # OutBasisChange in reverse
190 my @b=@_[5,1,2,6,3,7,0,4];
208 sub InvOutBasisChange { # InBasisChange in reverse
209 my @b=@_[2,5,7,3,6,1,0,4];
230 #;*************************************************************
231 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
232 #;*************************************************************
233 my ($x0,$x1,$y0,$y1,$t0)=@_;
246 sub Mul_GF4_N { # not used, see next subroutine
247 # multiply and scale by N
248 my ($x0,$x1,$y0,$y1,$t0)=@_;
262 # interleaved Mul_GF4_N and Mul_GF4
263 my ($x0,$x1,$y0,$y1,$t0,
264 $x2,$x3,$y2,$y3,$t1)=@_;
292 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
299 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
300 @x[2], @x[3], @y[2], @y[3], @t[2]);
312 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
313 @x[6], @x[7], @y[2], @y[3], @t[2]);
318 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
327 #;********************************************************************
328 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
329 #;********************************************************************
333 # direct optimizations from hardware
388 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
390 # new smaller inversion
424 # output in s3, s2, s1, t1
426 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
428 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
429 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
431 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
434 # AES linear components
440 pxor 0x00($key),@x[0]
441 pxor 0x10($key),@x[1]
442 pxor 0x20($key),@x[2]
443 pxor 0x30($key),@x[3]
446 pxor 0x40($key),@x[4]
447 pxor 0x50($key),@x[5]
450 pxor 0x60($key),@x[6]
451 pxor 0x70($key),@x[7]
461 # modified to emit output in order suitable for feeding back to aesenc[last]
464 my $inv=@_[16]; # optional
466 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
467 pshufd \$0x93, @x[1], @t[1]
468 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
469 pshufd \$0x93, @x[2], @t[2]
471 pshufd \$0x93, @x[3], @t[3]
473 pshufd \$0x93, @x[4], @t[4]
475 pshufd \$0x93, @x[5], @t[5]
477 pshufd \$0x93, @x[6], @t[6]
479 pshufd \$0x93, @x[7], @t[7]
486 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
488 pshufd \$0x4E, @x[1], @x[1]
494 pshufd \$0x4E, @x[4], @t[0]
496 pshufd \$0x4E, @x[5], @t[1]
498 pshufd \$0x4E, @x[3], @x[4]
500 pshufd \$0x4E, @x[7], @x[5]
502 pshufd \$0x4E, @x[6], @x[3]
504 pshufd \$0x4E, @x[2], @x[6]
507 $code.=<<___ if (!$inv);
515 $code.=<<___ if ($inv);
528 sub InvMixColumns_orig {
533 # multiplication by 0x0e
534 pshufd \$0x93, @x[7], @t[7]
536 pxor @x[5], @x[7] # 7 5
537 pxor @x[5], @x[2] # 2 5
538 pshufd \$0x93, @x[0], @t[0]
540 pxor @x[0], @x[5] # 5 0 [1]
541 pxor @x[1], @x[0] # 0 1
542 pshufd \$0x93, @x[1], @t[1]
543 pxor @x[2], @x[1] # 1 25
544 pxor @x[6], @x[0] # 01 6 [2]
545 pxor @x[3], @x[1] # 125 3 [4]
546 pshufd \$0x93, @x[3], @t[3]
547 pxor @x[0], @x[2] # 25 016 [3]
548 pxor @x[7], @x[3] # 3 75
549 pxor @x[6], @x[7] # 75 6 [0]
550 pshufd \$0x93, @x[6], @t[6]
552 pxor @x[4], @x[6] # 6 4
553 pxor @x[3], @x[4] # 4 375 [6]
554 pxor @x[7], @x[3] # 375 756=36
555 pxor @t[5], @x[6] # 64 5 [7]
556 pxor @t[2], @x[3] # 36 2
557 pxor @t[4], @x[3] # 362 4 [5]
558 pshufd \$0x93, @t[5], @t[5]
560 my @y = @x[7,5,0,2,1,3,4,6];
562 # multiplication by 0x0b
566 pshufd \$0x93, @t[2], @t[2]
570 pshufd \$0x93, @t[4], @t[4]
571 pxor @t[6], @t[7] # clobber t[7]
575 pshufd \$0x93, @t[0], @t[0]
579 pshufd \$0x93, @t[1], @t[1]
583 pshufd \$0x93, @t[2], @t[2]
587 pshufd \$0x93, @t[3], @t[3]
593 pxor @t[5], @t[7] # clobber t[7] even more
596 pshufd \$0x93, @t[4], @t[4]
601 pshufd \$0x93, @t[5], @t[5]
602 pxor @t[6], @t[7] # restore t[7]
604 # multiplication by 0x0d
607 pshufd \$0x93, @t[6], @t[6]
611 pshufd \$0x93, @t[7], @t[7]
620 pshufd \$0x93, @t[0], @t[0]
624 pshufd \$0x93, @t[1], @t[1]
629 pshufd \$0x93, @t[2], @t[2]
631 pxor @t[3], @t[6] # clobber t[6]
638 pshufd \$0x93, @t[4], @t[4]
641 pxor @t[3], @t[6] # restore t[6]
643 pshufd \$0x93, @t[5], @t[5]
644 pshufd \$0x93, @t[6], @t[6]
645 pshufd \$0x93, @t[7], @t[7]
646 pshufd \$0x93, @t[3], @t[3]
648 # multiplication by 0x09
650 pxor @y[1], @t[1] # t[1]=y[1]
651 pxor @t[5], @t[0] # clobber t[0]
654 pxor @y[0], @t[0] # t[0]=y[0]
656 pxor @t[7], @t[6] # clobber t[6]
659 pxor @y[4], @t[4] # t[4]=y[4]
661 pxor @y[3], @t[3] # t[3]=y[3]
663 pxor @y[2], @t[2] # t[2]=y[2]
665 pxor @y[5], @t[5] # t[5]=y[5]
668 pxor @y[6], @t[6] # t[6]=y[6]
669 pxor @y[7], @t[7] # t[7]=y[7]
686 # Thanks to Jussi Kivilinna for providing pointer to
688 # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
689 # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
690 # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
691 # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
694 # multiplication by 0x05-0x00-0x04-0x00
695 pshufd \$0x4E, @x[0], @t[0]
696 pshufd \$0x4E, @x[6], @t[6]
698 pshufd \$0x4E, @x[7], @t[7]
700 pshufd \$0x4E, @x[1], @t[1]
702 pshufd \$0x4E, @x[2], @t[2]
704 pshufd \$0x4E, @x[3], @t[3]
708 pshufd \$0x4E, @x[4], @t[4]
712 pshufd \$0x4E, @x[5], @t[5]
727 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
730 sub aesenc { # not used
734 movdqa 0x30($const),@t[0] # .LSR
736 &ShiftRows (@b,@t[0]);
738 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
741 sub aesenclast { # not used
745 movdqa 0x40($const),@t[0] # .LSRM0
747 &ShiftRows (@b,@t[0]);
750 pxor 0x00($key),@b[0]
751 pxor 0x10($key),@b[1]
752 pxor 0x20($key),@b[4]
753 pxor 0x30($key),@b[6]
754 pxor 0x40($key),@b[3]
755 pxor 0x50($key),@b[7]
756 pxor 0x60($key),@b[2]
757 pxor 0x70($key),@b[5]
762 my ($a,$b,$n,$mask,$t)=@_;
774 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
794 my @x=reverse(@_[0..7]);
795 my ($t0,$t1,$t2,$t3)=@_[8..11];
797 movdqa 0x00($const),$t0 # .LBS0
798 movdqa 0x10($const),$t1 # .LBS1
800 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
801 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
803 movdqa 0x20($const),$t0 # .LBS2
805 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
806 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
808 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
809 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
815 .extern asm_AES_encrypt
816 .extern asm_AES_decrypt
818 .type _bsaes_encrypt8,\@abi-omnipotent
822 lea .LBS0(%rip), $const # constants table
824 movdqa ($key), @XMM[9] # round 0 key
826 movdqa 0x50($const), @XMM[8] # .LM0SR
827 pxor @XMM[9], @XMM[0] # xor with round0 key
828 pxor @XMM[9], @XMM[1]
829 pxor @XMM[9], @XMM[2]
830 pxor @XMM[9], @XMM[3]
831 pshufb @XMM[8], @XMM[0]
832 pshufb @XMM[8], @XMM[1]
833 pxor @XMM[9], @XMM[4]
834 pxor @XMM[9], @XMM[5]
835 pshufb @XMM[8], @XMM[2]
836 pshufb @XMM[8], @XMM[3]
837 pxor @XMM[9], @XMM[6]
838 pxor @XMM[9], @XMM[7]
839 pshufb @XMM[8], @XMM[4]
840 pshufb @XMM[8], @XMM[5]
841 pshufb @XMM[8], @XMM[6]
842 pshufb @XMM[8], @XMM[7]
843 _bsaes_encrypt8_bitslice:
845 &bitslice (@XMM[0..7, 8..11]);
852 &ShiftRows (@XMM[0..7, 8]);
853 $code.=".Lenc_sbox:\n";
854 &Sbox (@XMM[0..7, 8..15]);
859 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
861 movdqa 0x30($const), @XMM[8] # .LSR
863 movdqa 0x40($const), @XMM[8] # .LSRM0
868 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
869 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
871 movdqa ($key), @XMM[8] # last round key
872 pxor @XMM[8], @XMM[4]
873 pxor @XMM[8], @XMM[6]
874 pxor @XMM[8], @XMM[3]
875 pxor @XMM[8], @XMM[7]
876 pxor @XMM[8], @XMM[2]
877 pxor @XMM[8], @XMM[5]
878 pxor @XMM[8], @XMM[0]
879 pxor @XMM[8], @XMM[1]
882 .size _bsaes_encrypt8,.-_bsaes_encrypt8
884 .type _bsaes_decrypt8,\@abi-omnipotent
888 lea .LBS0(%rip), $const # constants table
890 movdqa ($key), @XMM[9] # round 0 key
892 movdqa -0x30($const), @XMM[8] # .LM0ISR
893 pxor @XMM[9], @XMM[0] # xor with round0 key
894 pxor @XMM[9], @XMM[1]
895 pxor @XMM[9], @XMM[2]
896 pxor @XMM[9], @XMM[3]
897 pshufb @XMM[8], @XMM[0]
898 pshufb @XMM[8], @XMM[1]
899 pxor @XMM[9], @XMM[4]
900 pxor @XMM[9], @XMM[5]
901 pshufb @XMM[8], @XMM[2]
902 pshufb @XMM[8], @XMM[3]
903 pxor @XMM[9], @XMM[6]
904 pxor @XMM[9], @XMM[7]
905 pshufb @XMM[8], @XMM[4]
906 pshufb @XMM[8], @XMM[5]
907 pshufb @XMM[8], @XMM[6]
908 pshufb @XMM[8], @XMM[7]
910 &bitslice (@XMM[0..7, 8..11]);
917 &ShiftRows (@XMM[0..7, 8]);
918 $code.=".Ldec_sbox:\n";
919 &InvSbox (@XMM[0..7, 8..15]);
924 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
926 movdqa -0x10($const), @XMM[8] # .LISR
928 movdqa -0x20($const), @XMM[8] # .LISRM0
933 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
935 movdqa ($key), @XMM[8] # last round key
936 pxor @XMM[8], @XMM[6]
937 pxor @XMM[8], @XMM[4]
938 pxor @XMM[8], @XMM[2]
939 pxor @XMM[8], @XMM[7]
940 pxor @XMM[8], @XMM[3]
941 pxor @XMM[8], @XMM[5]
942 pxor @XMM[8], @XMM[0]
943 pxor @XMM[8], @XMM[1]
946 .size _bsaes_decrypt8,.-_bsaes_decrypt8
950 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
953 my @x=reverse(@_[0..7]);
954 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
956 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
958 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
962 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
964 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
966 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
972 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
973 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
977 .type _bsaes_key_convert,\@abi-omnipotent
981 lea .Lmasks(%rip), $const
982 movdqu ($inp), %xmm7 # load round 0 key
984 movdqa 0x00($const), %xmm0 # 0x01...
985 movdqa 0x10($const), %xmm1 # 0x02...
986 movdqa 0x20($const), %xmm2 # 0x04...
987 movdqa 0x30($const), %xmm3 # 0x08...
988 movdqa 0x40($const), %xmm4 # .LM0
989 pcmpeqd %xmm5, %xmm5 # .LNOT
991 movdqu ($inp), %xmm6 # load round 1 key
992 movdqa %xmm7, ($out) # save round 0 key
998 pshufb %xmm4, %xmm6 # .LM0
1005 movdqa %xmm2, %xmm10
1006 pcmpeqb %xmm0, %xmm8
1007 psllq \$4, %xmm0 # 0x10...
1008 movdqa %xmm3, %xmm11
1009 pcmpeqb %xmm1, %xmm9
1010 psllq \$4, %xmm1 # 0x20...
1014 movdqa %xmm0, %xmm12
1015 pcmpeqb %xmm2, %xmm10
1016 psllq \$4, %xmm2 # 0x40...
1017 movdqa %xmm1, %xmm13
1018 pcmpeqb %xmm3, %xmm11
1019 psllq \$4, %xmm3 # 0x80...
1021 movdqa %xmm2, %xmm14
1022 movdqa %xmm3, %xmm15
1023 pxor %xmm5, %xmm8 # "pnot"
1028 movdqa %xmm8, 0x00($out) # write bit-sliced round key
1029 pcmpeqb %xmm0, %xmm12
1030 psrlq \$4, %xmm0 # 0x01...
1031 movdqa %xmm9, 0x10($out)
1032 pcmpeqb %xmm1, %xmm13
1033 psrlq \$4, %xmm1 # 0x02...
1034 lea 0x10($inp), $inp
1038 movdqa %xmm10, 0x20($out)
1039 pcmpeqb %xmm2, %xmm14
1040 psrlq \$4, %xmm2 # 0x04...
1041 movdqa %xmm11, 0x30($out)
1042 pcmpeqb %xmm3, %xmm15
1043 psrlq \$4, %xmm3 # 0x08...
1044 movdqu ($inp), %xmm6 # load next round key
1046 pxor %xmm5, %xmm13 # "pnot"
1048 movdqa %xmm12, 0x40($out)
1049 movdqa %xmm13, 0x50($out)
1050 movdqa %xmm14, 0x60($out)
1051 movdqa %xmm15, 0x70($out)
1056 movdqa 0x50($const), %xmm7 # .L63
1057 #movdqa %xmm6, ($out) # don't save last round key
1060 .size _bsaes_key_convert,.-_bsaes_key_convert
1064 if (0 && !$win64) { # following four functions are unsupported interface
1065 # used for benchmarking...
1067 .globl bsaes_enc_key_convert
1068 .type bsaes_enc_key_convert,\@function,2
1070 bsaes_enc_key_convert:
1071 mov 240($inp),%r10d # pass rounds
1072 mov $inp,%rcx # pass key
1073 mov $out,%rax # pass key schedule
1074 call _bsaes_key_convert
1075 pxor %xmm6,%xmm7 # fix up last round key
1076 movdqa %xmm7,(%rax) # save last round key
1078 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1080 .globl bsaes_encrypt_128
1081 .type bsaes_encrypt_128,\@function,4
1085 movdqu 0x00($inp), @XMM[0] # load input
1086 movdqu 0x10($inp), @XMM[1]
1087 movdqu 0x20($inp), @XMM[2]
1088 movdqu 0x30($inp), @XMM[3]
1089 movdqu 0x40($inp), @XMM[4]
1090 movdqu 0x50($inp), @XMM[5]
1091 movdqu 0x60($inp), @XMM[6]
1092 movdqu 0x70($inp), @XMM[7]
1093 mov $key, %rax # pass the $key
1094 lea 0x80($inp), $inp
1097 call _bsaes_encrypt8
1099 movdqu @XMM[0], 0x00($out) # write output
1100 movdqu @XMM[1], 0x10($out)
1101 movdqu @XMM[4], 0x20($out)
1102 movdqu @XMM[6], 0x30($out)
1103 movdqu @XMM[3], 0x40($out)
1104 movdqu @XMM[7], 0x50($out)
1105 movdqu @XMM[2], 0x60($out)
1106 movdqu @XMM[5], 0x70($out)
1107 lea 0x80($out), $out
1111 .size bsaes_encrypt_128,.-bsaes_encrypt_128
1113 .globl bsaes_dec_key_convert
1114 .type bsaes_dec_key_convert,\@function,2
1116 bsaes_dec_key_convert:
1117 mov 240($inp),%r10d # pass rounds
1118 mov $inp,%rcx # pass key
1119 mov $out,%rax # pass key schedule
1120 call _bsaes_key_convert
1121 pxor ($out),%xmm7 # fix up round 0 key
1122 movdqa %xmm6,(%rax) # save last round key
1125 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1127 .globl bsaes_decrypt_128
1128 .type bsaes_decrypt_128,\@function,4
1132 movdqu 0x00($inp), @XMM[0] # load input
1133 movdqu 0x10($inp), @XMM[1]
1134 movdqu 0x20($inp), @XMM[2]
1135 movdqu 0x30($inp), @XMM[3]
1136 movdqu 0x40($inp), @XMM[4]
1137 movdqu 0x50($inp), @XMM[5]
1138 movdqu 0x60($inp), @XMM[6]
1139 movdqu 0x70($inp), @XMM[7]
1140 mov $key, %rax # pass the $key
1141 lea 0x80($inp), $inp
1144 call _bsaes_decrypt8
1146 movdqu @XMM[0], 0x00($out) # write output
1147 movdqu @XMM[1], 0x10($out)
1148 movdqu @XMM[6], 0x20($out)
1149 movdqu @XMM[4], 0x30($out)
1150 movdqu @XMM[2], 0x40($out)
1151 movdqu @XMM[7], 0x50($out)
1152 movdqu @XMM[3], 0x60($out)
1153 movdqu @XMM[5], 0x70($out)
1154 lea 0x80($out), $out
1158 .size bsaes_decrypt_128,.-bsaes_decrypt_128
1162 ######################################################################
1166 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1167 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1168 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1172 .globl bsaes_ecb_encrypt_blocks
1173 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1175 bsaes_ecb_encrypt_blocks:
1191 lea -0x48(%rsp),%rsp
1192 .cfi_adjust_cfa_offset 0x48
1194 $code.=<<___ if ($win64);
1195 lea -0xa0(%rsp), %rsp
1196 movaps %xmm6, 0x40(%rsp)
1197 movaps %xmm7, 0x50(%rsp)
1198 movaps %xmm8, 0x60(%rsp)
1199 movaps %xmm9, 0x70(%rsp)
1200 movaps %xmm10, 0x80(%rsp)
1201 movaps %xmm11, 0x90(%rsp)
1202 movaps %xmm12, 0xa0(%rsp)
1203 movaps %xmm13, 0xb0(%rsp)
1204 movaps %xmm14, 0xc0(%rsp)
1205 movaps %xmm15, 0xd0(%rsp)
1209 mov %rsp,%rbp # backup %rsp
1210 .cfi_def_cfa_register %rbp
1211 mov 240($arg4),%eax # rounds
1212 mov $arg1,$inp # backup arguments
1219 mov %eax,%ebx # backup rounds
1220 shl \$7,%rax # 128 bytes per inner round key
1221 sub \$`128-32`,%rax # size of bit-sliced key schedule
1223 mov %rsp,%rax # pass key schedule
1224 mov $key,%rcx # pass key
1225 mov %ebx,%r10d # pass rounds
1226 call _bsaes_key_convert
1227 pxor %xmm6,%xmm7 # fix up last round key
1228 movdqa %xmm7,(%rax) # save last round key
1232 movdqu 0x00($inp), @XMM[0] # load input
1233 movdqu 0x10($inp), @XMM[1]
1234 movdqu 0x20($inp), @XMM[2]
1235 movdqu 0x30($inp), @XMM[3]
1236 movdqu 0x40($inp), @XMM[4]
1237 movdqu 0x50($inp), @XMM[5]
1238 mov %rsp, %rax # pass key schedule
1239 movdqu 0x60($inp), @XMM[6]
1240 mov %ebx,%r10d # pass rounds
1241 movdqu 0x70($inp), @XMM[7]
1242 lea 0x80($inp), $inp
1244 call _bsaes_encrypt8
1246 movdqu @XMM[0], 0x00($out) # write output
1247 movdqu @XMM[1], 0x10($out)
1248 movdqu @XMM[4], 0x20($out)
1249 movdqu @XMM[6], 0x30($out)
1250 movdqu @XMM[3], 0x40($out)
1251 movdqu @XMM[7], 0x50($out)
1252 movdqu @XMM[2], 0x60($out)
1253 movdqu @XMM[5], 0x70($out)
1254 lea 0x80($out), $out
1261 movdqu 0x00($inp), @XMM[0] # load input
1262 mov %rsp, %rax # pass key schedule
1263 mov %ebx,%r10d # pass rounds
1266 movdqu 0x10($inp), @XMM[1]
1268 movdqu 0x20($inp), @XMM[2]
1271 movdqu 0x30($inp), @XMM[3]
1273 movdqu 0x40($inp), @XMM[4]
1276 movdqu 0x50($inp), @XMM[5]
1278 movdqu 0x60($inp), @XMM[6]
1279 call _bsaes_encrypt8
1280 movdqu @XMM[0], 0x00($out) # write output
1281 movdqu @XMM[1], 0x10($out)
1282 movdqu @XMM[4], 0x20($out)
1283 movdqu @XMM[6], 0x30($out)
1284 movdqu @XMM[3], 0x40($out)
1285 movdqu @XMM[7], 0x50($out)
1286 movdqu @XMM[2], 0x60($out)
1290 call _bsaes_encrypt8
1291 movdqu @XMM[0], 0x00($out) # write output
1292 movdqu @XMM[1], 0x10($out)
1293 movdqu @XMM[4], 0x20($out)
1294 movdqu @XMM[6], 0x30($out)
1295 movdqu @XMM[3], 0x40($out)
1296 movdqu @XMM[7], 0x50($out)
1300 call _bsaes_encrypt8
1301 movdqu @XMM[0], 0x00($out) # write output
1302 movdqu @XMM[1], 0x10($out)
1303 movdqu @XMM[4], 0x20($out)
1304 movdqu @XMM[6], 0x30($out)
1305 movdqu @XMM[3], 0x40($out)
1309 call _bsaes_encrypt8
1310 movdqu @XMM[0], 0x00($out) # write output
1311 movdqu @XMM[1], 0x10($out)
1312 movdqu @XMM[4], 0x20($out)
1313 movdqu @XMM[6], 0x30($out)
1317 call _bsaes_encrypt8
1318 movdqu @XMM[0], 0x00($out) # write output
1319 movdqu @XMM[1], 0x10($out)
1320 movdqu @XMM[4], 0x20($out)
1324 call _bsaes_encrypt8
1325 movdqu @XMM[0], 0x00($out) # write output
1326 movdqu @XMM[1], 0x10($out)
1330 call _bsaes_encrypt8
1331 movdqu @XMM[0], 0x00($out) # write output
1338 call asm_AES_encrypt
1347 .Lecb_enc_bzero: # wipe key schedule [if any]
1348 movdqa %xmm0, 0x00(%rax)
1349 movdqa %xmm0, 0x10(%rax)
1350 lea 0x20(%rax), %rax
1357 $code.=<<___ if ($win64);
1358 movaps 0x40(%rbp), %xmm6
1359 movaps 0x50(%rbp), %xmm7
1360 movaps 0x60(%rbp), %xmm8
1361 movaps 0x70(%rbp), %xmm9
1362 movaps 0x80(%rbp), %xmm10
1363 movaps 0x90(%rbp), %xmm11
1364 movaps 0xa0(%rbp), %xmm12
1365 movaps 0xb0(%rbp), %xmm13
1366 movaps 0xc0(%rbp), %xmm14
1367 movaps 0xd0(%rbp), %xmm15
1368 lea 0xa0(%rax), %rax
1384 lea (%rax), %rsp # restore %rsp
1385 .cfi_def_cfa_register %rsp
1389 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1391 .globl bsaes_ecb_decrypt_blocks
1392 .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1394 bsaes_ecb_decrypt_blocks:
1410 lea -0x48(%rsp),%rsp
1411 .cfi_adjust_cfa_offset 0x48
1413 $code.=<<___ if ($win64);
1414 lea -0xa0(%rsp), %rsp
1415 movaps %xmm6, 0x40(%rsp)
1416 movaps %xmm7, 0x50(%rsp)
1417 movaps %xmm8, 0x60(%rsp)
1418 movaps %xmm9, 0x70(%rsp)
1419 movaps %xmm10, 0x80(%rsp)
1420 movaps %xmm11, 0x90(%rsp)
1421 movaps %xmm12, 0xa0(%rsp)
1422 movaps %xmm13, 0xb0(%rsp)
1423 movaps %xmm14, 0xc0(%rsp)
1424 movaps %xmm15, 0xd0(%rsp)
1428 mov %rsp,%rbp # backup %rsp
1429 .cfi_def_cfa_register %rbp
1430 mov 240($arg4),%eax # rounds
1431 mov $arg1,$inp # backup arguments
1438 mov %eax,%ebx # backup rounds
1439 shl \$7,%rax # 128 bytes per inner round key
1440 sub \$`128-32`,%rax # size of bit-sliced key schedule
1442 mov %rsp,%rax # pass key schedule
1443 mov $key,%rcx # pass key
1444 mov %ebx,%r10d # pass rounds
1445 call _bsaes_key_convert
1446 pxor (%rsp),%xmm7 # fix up 0 round key
1447 movdqa %xmm6,(%rax) # save last round key
1452 movdqu 0x00($inp), @XMM[0] # load input
1453 movdqu 0x10($inp), @XMM[1]
1454 movdqu 0x20($inp), @XMM[2]
1455 movdqu 0x30($inp), @XMM[3]
1456 movdqu 0x40($inp), @XMM[4]
1457 movdqu 0x50($inp), @XMM[5]
1458 mov %rsp, %rax # pass key schedule
1459 movdqu 0x60($inp), @XMM[6]
1460 mov %ebx,%r10d # pass rounds
1461 movdqu 0x70($inp), @XMM[7]
1462 lea 0x80($inp), $inp
1464 call _bsaes_decrypt8
1466 movdqu @XMM[0], 0x00($out) # write output
1467 movdqu @XMM[1], 0x10($out)
1468 movdqu @XMM[6], 0x20($out)
1469 movdqu @XMM[4], 0x30($out)
1470 movdqu @XMM[2], 0x40($out)
1471 movdqu @XMM[7], 0x50($out)
1472 movdqu @XMM[3], 0x60($out)
1473 movdqu @XMM[5], 0x70($out)
1474 lea 0x80($out), $out
1481 movdqu 0x00($inp), @XMM[0] # load input
1482 mov %rsp, %rax # pass key schedule
1483 mov %ebx,%r10d # pass rounds
1486 movdqu 0x10($inp), @XMM[1]
1488 movdqu 0x20($inp), @XMM[2]
1491 movdqu 0x30($inp), @XMM[3]
1493 movdqu 0x40($inp), @XMM[4]
1496 movdqu 0x50($inp), @XMM[5]
1498 movdqu 0x60($inp), @XMM[6]
1499 call _bsaes_decrypt8
1500 movdqu @XMM[0], 0x00($out) # write output
1501 movdqu @XMM[1], 0x10($out)
1502 movdqu @XMM[6], 0x20($out)
1503 movdqu @XMM[4], 0x30($out)
1504 movdqu @XMM[2], 0x40($out)
1505 movdqu @XMM[7], 0x50($out)
1506 movdqu @XMM[3], 0x60($out)
1510 call _bsaes_decrypt8
1511 movdqu @XMM[0], 0x00($out) # write output
1512 movdqu @XMM[1], 0x10($out)
1513 movdqu @XMM[6], 0x20($out)
1514 movdqu @XMM[4], 0x30($out)
1515 movdqu @XMM[2], 0x40($out)
1516 movdqu @XMM[7], 0x50($out)
1520 call _bsaes_decrypt8
1521 movdqu @XMM[0], 0x00($out) # write output
1522 movdqu @XMM[1], 0x10($out)
1523 movdqu @XMM[6], 0x20($out)
1524 movdqu @XMM[4], 0x30($out)
1525 movdqu @XMM[2], 0x40($out)
1529 call _bsaes_decrypt8
1530 movdqu @XMM[0], 0x00($out) # write output
1531 movdqu @XMM[1], 0x10($out)
1532 movdqu @XMM[6], 0x20($out)
1533 movdqu @XMM[4], 0x30($out)
1537 call _bsaes_decrypt8
1538 movdqu @XMM[0], 0x00($out) # write output
1539 movdqu @XMM[1], 0x10($out)
1540 movdqu @XMM[6], 0x20($out)
1544 call _bsaes_decrypt8
1545 movdqu @XMM[0], 0x00($out) # write output
1546 movdqu @XMM[1], 0x10($out)
1550 call _bsaes_decrypt8
1551 movdqu @XMM[0], 0x00($out) # write output
1558 call asm_AES_decrypt
1567 .Lecb_dec_bzero: # wipe key schedule [if any]
1568 movdqa %xmm0, 0x00(%rax)
1569 movdqa %xmm0, 0x10(%rax)
1570 lea 0x20(%rax), %rax
1577 $code.=<<___ if ($win64);
1578 movaps 0x40(%rbp), %xmm6
1579 movaps 0x50(%rbp), %xmm7
1580 movaps 0x60(%rbp), %xmm8
1581 movaps 0x70(%rbp), %xmm9
1582 movaps 0x80(%rbp), %xmm10
1583 movaps 0x90(%rbp), %xmm11
1584 movaps 0xa0(%rbp), %xmm12
1585 movaps 0xb0(%rbp), %xmm13
1586 movaps 0xc0(%rbp), %xmm14
1587 movaps 0xd0(%rbp), %xmm15
1588 lea 0xa0(%rax), %rax
1604 lea (%rax), %rsp # restore %rsp
1605 .cfi_def_cfa_register %rsp
1609 .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1613 .extern asm_AES_cbc_encrypt
1614 .globl bsaes_cbc_encrypt
1615 .type bsaes_cbc_encrypt,\@abi-omnipotent
1621 $code.=<<___ if ($win64);
1622 mov 48(%rsp),$arg6 # pull direction flag
1626 jne asm_AES_cbc_encrypt
1628 jb asm_AES_cbc_encrypt
1644 lea -0x48(%rsp), %rsp
1645 .cfi_adjust_cfa_offset 0x48
1647 $code.=<<___ if ($win64);
1648 mov 0xa0(%rsp),$arg5 # pull ivp
1649 lea -0xa0(%rsp), %rsp
1650 movaps %xmm6, 0x40(%rsp)
1651 movaps %xmm7, 0x50(%rsp)
1652 movaps %xmm8, 0x60(%rsp)
1653 movaps %xmm9, 0x70(%rsp)
1654 movaps %xmm10, 0x80(%rsp)
1655 movaps %xmm11, 0x90(%rsp)
1656 movaps %xmm12, 0xa0(%rsp)
1657 movaps %xmm13, 0xb0(%rsp)
1658 movaps %xmm14, 0xc0(%rsp)
1659 movaps %xmm15, 0xd0(%rsp)
1663 mov %rsp, %rbp # backup %rsp
1664 .cfi_def_cfa_register %rbp
1665 mov 240($arg4), %eax # rounds
1666 mov $arg1, $inp # backup arguments
1671 shr \$4, $len # bytes to blocks
1673 mov %eax, %edx # rounds
1674 shl \$7, %rax # 128 bytes per inner round key
1675 sub \$`128-32`, %rax # size of bit-sliced key schedule
1678 mov %rsp, %rax # pass key schedule
1679 mov $key, %rcx # pass key
1680 mov %edx, %r10d # pass rounds
1681 call _bsaes_key_convert
1682 pxor (%rsp),%xmm7 # fix up 0 round key
1683 movdqa %xmm6,(%rax) # save last round key
1686 movdqu (%rbx), @XMM[15] # load IV
1689 movdqu 0x00($inp), @XMM[0] # load input
1690 movdqu 0x10($inp), @XMM[1]
1691 movdqu 0x20($inp), @XMM[2]
1692 movdqu 0x30($inp), @XMM[3]
1693 movdqu 0x40($inp), @XMM[4]
1694 movdqu 0x50($inp), @XMM[5]
1695 mov %rsp, %rax # pass key schedule
1696 movdqu 0x60($inp), @XMM[6]
1697 mov %edx,%r10d # pass rounds
1698 movdqu 0x70($inp), @XMM[7]
1699 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1701 call _bsaes_decrypt8
1703 pxor 0x20(%rbp), @XMM[0] # ^= IV
1704 movdqu 0x00($inp), @XMM[8] # re-load input
1705 movdqu 0x10($inp), @XMM[9]
1706 pxor @XMM[8], @XMM[1]
1707 movdqu 0x20($inp), @XMM[10]
1708 pxor @XMM[9], @XMM[6]
1709 movdqu 0x30($inp), @XMM[11]
1710 pxor @XMM[10], @XMM[4]
1711 movdqu 0x40($inp), @XMM[12]
1712 pxor @XMM[11], @XMM[2]
1713 movdqu 0x50($inp), @XMM[13]
1714 pxor @XMM[12], @XMM[7]
1715 movdqu 0x60($inp), @XMM[14]
1716 pxor @XMM[13], @XMM[3]
1717 movdqu 0x70($inp), @XMM[15] # IV
1718 pxor @XMM[14], @XMM[5]
1719 movdqu @XMM[0], 0x00($out) # write output
1720 lea 0x80($inp), $inp
1721 movdqu @XMM[1], 0x10($out)
1722 movdqu @XMM[6], 0x20($out)
1723 movdqu @XMM[4], 0x30($out)
1724 movdqu @XMM[2], 0x40($out)
1725 movdqu @XMM[7], 0x50($out)
1726 movdqu @XMM[3], 0x60($out)
1727 movdqu @XMM[5], 0x70($out)
1728 lea 0x80($out), $out
1735 movdqu 0x00($inp), @XMM[0] # load input
1736 mov %rsp, %rax # pass key schedule
1737 mov %edx, %r10d # pass rounds
1740 movdqu 0x10($inp), @XMM[1]
1742 movdqu 0x20($inp), @XMM[2]
1745 movdqu 0x30($inp), @XMM[3]
1747 movdqu 0x40($inp), @XMM[4]
1750 movdqu 0x50($inp), @XMM[5]
1752 movdqu 0x60($inp), @XMM[6]
1753 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1754 call _bsaes_decrypt8
1755 pxor 0x20(%rbp), @XMM[0] # ^= IV
1756 movdqu 0x00($inp), @XMM[8] # re-load input
1757 movdqu 0x10($inp), @XMM[9]
1758 pxor @XMM[8], @XMM[1]
1759 movdqu 0x20($inp), @XMM[10]
1760 pxor @XMM[9], @XMM[6]
1761 movdqu 0x30($inp), @XMM[11]
1762 pxor @XMM[10], @XMM[4]
1763 movdqu 0x40($inp), @XMM[12]
1764 pxor @XMM[11], @XMM[2]
1765 movdqu 0x50($inp), @XMM[13]
1766 pxor @XMM[12], @XMM[7]
1767 movdqu 0x60($inp), @XMM[15] # IV
1768 pxor @XMM[13], @XMM[3]
1769 movdqu @XMM[0], 0x00($out) # write output
1770 movdqu @XMM[1], 0x10($out)
1771 movdqu @XMM[6], 0x20($out)
1772 movdqu @XMM[4], 0x30($out)
1773 movdqu @XMM[2], 0x40($out)
1774 movdqu @XMM[7], 0x50($out)
1775 movdqu @XMM[3], 0x60($out)
1779 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1780 call _bsaes_decrypt8
1781 pxor 0x20(%rbp), @XMM[0] # ^= IV
1782 movdqu 0x00($inp), @XMM[8] # re-load input
1783 movdqu 0x10($inp), @XMM[9]
1784 pxor @XMM[8], @XMM[1]
1785 movdqu 0x20($inp), @XMM[10]
1786 pxor @XMM[9], @XMM[6]
1787 movdqu 0x30($inp), @XMM[11]
1788 pxor @XMM[10], @XMM[4]
1789 movdqu 0x40($inp), @XMM[12]
1790 pxor @XMM[11], @XMM[2]
1791 movdqu 0x50($inp), @XMM[15] # IV
1792 pxor @XMM[12], @XMM[7]
1793 movdqu @XMM[0], 0x00($out) # write output
1794 movdqu @XMM[1], 0x10($out)
1795 movdqu @XMM[6], 0x20($out)
1796 movdqu @XMM[4], 0x30($out)
1797 movdqu @XMM[2], 0x40($out)
1798 movdqu @XMM[7], 0x50($out)
1802 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1803 call _bsaes_decrypt8
1804 pxor 0x20(%rbp), @XMM[0] # ^= IV
1805 movdqu 0x00($inp), @XMM[8] # re-load input
1806 movdqu 0x10($inp), @XMM[9]
1807 pxor @XMM[8], @XMM[1]
1808 movdqu 0x20($inp), @XMM[10]
1809 pxor @XMM[9], @XMM[6]
1810 movdqu 0x30($inp), @XMM[11]
1811 pxor @XMM[10], @XMM[4]
1812 movdqu 0x40($inp), @XMM[15] # IV
1813 pxor @XMM[11], @XMM[2]
1814 movdqu @XMM[0], 0x00($out) # write output
1815 movdqu @XMM[1], 0x10($out)
1816 movdqu @XMM[6], 0x20($out)
1817 movdqu @XMM[4], 0x30($out)
1818 movdqu @XMM[2], 0x40($out)
1822 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1823 call _bsaes_decrypt8
1824 pxor 0x20(%rbp), @XMM[0] # ^= IV
1825 movdqu 0x00($inp), @XMM[8] # re-load input
1826 movdqu 0x10($inp), @XMM[9]
1827 pxor @XMM[8], @XMM[1]
1828 movdqu 0x20($inp), @XMM[10]
1829 pxor @XMM[9], @XMM[6]
1830 movdqu 0x30($inp), @XMM[15] # IV
1831 pxor @XMM[10], @XMM[4]
1832 movdqu @XMM[0], 0x00($out) # write output
1833 movdqu @XMM[1], 0x10($out)
1834 movdqu @XMM[6], 0x20($out)
1835 movdqu @XMM[4], 0x30($out)
1839 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1840 call _bsaes_decrypt8
1841 pxor 0x20(%rbp), @XMM[0] # ^= IV
1842 movdqu 0x00($inp), @XMM[8] # re-load input
1843 movdqu 0x10($inp), @XMM[9]
1844 pxor @XMM[8], @XMM[1]
1845 movdqu 0x20($inp), @XMM[15] # IV
1846 pxor @XMM[9], @XMM[6]
1847 movdqu @XMM[0], 0x00($out) # write output
1848 movdqu @XMM[1], 0x10($out)
1849 movdqu @XMM[6], 0x20($out)
1853 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1854 call _bsaes_decrypt8
1855 pxor 0x20(%rbp), @XMM[0] # ^= IV
1856 movdqu 0x00($inp), @XMM[8] # re-load input
1857 movdqu 0x10($inp), @XMM[15] # IV
1858 pxor @XMM[8], @XMM[1]
1859 movdqu @XMM[0], 0x00($out) # write output
1860 movdqu @XMM[1], 0x10($out)
1865 lea 0x20(%rbp), $arg2 # buffer output
1867 call asm_AES_decrypt # doesn't touch %xmm
1868 pxor 0x20(%rbp), @XMM[15] # ^= IV
1869 movdqu @XMM[15], ($out) # write output
1870 movdqa @XMM[0], @XMM[15] # IV
1873 movdqu @XMM[15], (%rbx) # return IV
1876 .Lcbc_dec_bzero: # wipe key schedule [if any]
1877 movdqa %xmm0, 0x00(%rax)
1878 movdqa %xmm0, 0x10(%rax)
1879 lea 0x20(%rax), %rax
1886 $code.=<<___ if ($win64);
1887 movaps 0x40(%rbp), %xmm6
1888 movaps 0x50(%rbp), %xmm7
1889 movaps 0x60(%rbp), %xmm8
1890 movaps 0x70(%rbp), %xmm9
1891 movaps 0x80(%rbp), %xmm10
1892 movaps 0x90(%rbp), %xmm11
1893 movaps 0xa0(%rbp), %xmm12
1894 movaps 0xb0(%rbp), %xmm13
1895 movaps 0xc0(%rbp), %xmm14
1896 movaps 0xd0(%rbp), %xmm15
1897 lea 0xa0(%rax), %rax
1913 lea (%rax), %rsp # restore %rsp
1914 .cfi_def_cfa_register %rsp
1918 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
1920 .globl bsaes_ctr32_encrypt_blocks
1921 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1923 bsaes_ctr32_encrypt_blocks:
1940 lea -0x48(%rsp), %rsp
1941 .cfi_adjust_cfa_offset 0x48
1943 $code.=<<___ if ($win64);
1944 mov 0xa0(%rsp),$arg5 # pull ivp
1945 lea -0xa0(%rsp), %rsp
1946 movaps %xmm6, 0x40(%rsp)
1947 movaps %xmm7, 0x50(%rsp)
1948 movaps %xmm8, 0x60(%rsp)
1949 movaps %xmm9, 0x70(%rsp)
1950 movaps %xmm10, 0x80(%rsp)
1951 movaps %xmm11, 0x90(%rsp)
1952 movaps %xmm12, 0xa0(%rsp)
1953 movaps %xmm13, 0xb0(%rsp)
1954 movaps %xmm14, 0xc0(%rsp)
1955 movaps %xmm15, 0xd0(%rsp)
1959 mov %rsp, %rbp # backup %rsp
1960 .cfi_def_cfa_register %rbp
1961 movdqu ($arg5), %xmm0 # load counter
1962 mov 240($arg4), %eax # rounds
1963 mov $arg1, $inp # backup arguments
1967 movdqa %xmm0, 0x20(%rbp) # copy counter
1971 mov %eax, %ebx # rounds
1972 shl \$7, %rax # 128 bytes per inner round key
1973 sub \$`128-32`, %rax # size of bit-sliced key schedule
1976 mov %rsp, %rax # pass key schedule
1977 mov $key, %rcx # pass key
1978 mov %ebx, %r10d # pass rounds
1979 call _bsaes_key_convert
1980 pxor %xmm6,%xmm7 # fix up last round key
1981 movdqa %xmm7,(%rax) # save last round key
1983 movdqa (%rsp), @XMM[9] # load round0 key
1984 lea .LADD1(%rip), %r11
1985 movdqa 0x20(%rbp), @XMM[0] # counter copy
1986 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1987 pshufb @XMM[8], @XMM[9] # byte swap upper part
1988 pshufb @XMM[8], @XMM[0]
1989 movdqa @XMM[9], (%rsp) # save adjusted round0 key
1993 movdqa @XMM[0], 0x20(%rbp) # save counter
1994 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1995 movdqa @XMM[0], @XMM[2]
1996 paddd 0x00(%r11), @XMM[1] # .LADD1
1997 movdqa @XMM[0], @XMM[3]
1998 paddd 0x10(%r11), @XMM[2] # .LADD2
1999 movdqa @XMM[0], @XMM[4]
2000 paddd 0x20(%r11), @XMM[3] # .LADD3
2001 movdqa @XMM[0], @XMM[5]
2002 paddd 0x30(%r11), @XMM[4] # .LADD4
2003 movdqa @XMM[0], @XMM[6]
2004 paddd 0x40(%r11), @XMM[5] # .LADD5
2005 movdqa @XMM[0], @XMM[7]
2006 paddd 0x50(%r11), @XMM[6] # .LADD6
2007 paddd 0x60(%r11), @XMM[7] # .LADD7
2009 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
2010 # to flip byte order in 32-bit counter
2011 movdqa (%rsp), @XMM[9] # round 0 key
2012 lea 0x10(%rsp), %rax # pass key schedule
2013 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
2014 pxor @XMM[9], @XMM[0] # xor with round0 key
2015 pxor @XMM[9], @XMM[1]
2016 pxor @XMM[9], @XMM[2]
2017 pxor @XMM[9], @XMM[3]
2018 pshufb @XMM[8], @XMM[0]
2019 pshufb @XMM[8], @XMM[1]
2020 pxor @XMM[9], @XMM[4]
2021 pxor @XMM[9], @XMM[5]
2022 pshufb @XMM[8], @XMM[2]
2023 pshufb @XMM[8], @XMM[3]
2024 pxor @XMM[9], @XMM[6]
2025 pxor @XMM[9], @XMM[7]
2026 pshufb @XMM[8], @XMM[4]
2027 pshufb @XMM[8], @XMM[5]
2028 pshufb @XMM[8], @XMM[6]
2029 pshufb @XMM[8], @XMM[7]
2030 lea .LBS0(%rip), %r11 # constants table
2031 mov %ebx,%r10d # pass rounds
2033 call _bsaes_encrypt8_bitslice
2036 jc .Lctr_enc_loop_done
2038 movdqu 0x00($inp), @XMM[8] # load input
2039 movdqu 0x10($inp), @XMM[9]
2040 movdqu 0x20($inp), @XMM[10]
2041 movdqu 0x30($inp), @XMM[11]
2042 movdqu 0x40($inp), @XMM[12]
2043 movdqu 0x50($inp), @XMM[13]
2044 movdqu 0x60($inp), @XMM[14]
2045 movdqu 0x70($inp), @XMM[15]
2047 pxor @XMM[0], @XMM[8]
2048 movdqa 0x20(%rbp), @XMM[0] # load counter
2049 pxor @XMM[9], @XMM[1]
2050 movdqu @XMM[8], 0x00($out) # write output
2051 pxor @XMM[10], @XMM[4]
2052 movdqu @XMM[1], 0x10($out)
2053 pxor @XMM[11], @XMM[6]
2054 movdqu @XMM[4], 0x20($out)
2055 pxor @XMM[12], @XMM[3]
2056 movdqu @XMM[6], 0x30($out)
2057 pxor @XMM[13], @XMM[7]
2058 movdqu @XMM[3], 0x40($out)
2059 pxor @XMM[14], @XMM[2]
2060 movdqu @XMM[7], 0x50($out)
2061 pxor @XMM[15], @XMM[5]
2062 movdqu @XMM[2], 0x60($out)
2063 lea .LADD1(%rip), %r11
2064 movdqu @XMM[5], 0x70($out)
2065 lea 0x80($out), $out
2066 paddd 0x70(%r11), @XMM[0] # .LADD8
2071 .Lctr_enc_loop_done:
2073 movdqu 0x00($inp), @XMM[8] # load input
2074 pxor @XMM[8], @XMM[0]
2075 movdqu @XMM[0], 0x00($out) # write output
2078 movdqu 0x10($inp), @XMM[9]
2079 pxor @XMM[9], @XMM[1]
2080 movdqu @XMM[1], 0x10($out)
2082 movdqu 0x20($inp), @XMM[10]
2083 pxor @XMM[10], @XMM[4]
2084 movdqu @XMM[4], 0x20($out)
2087 movdqu 0x30($inp), @XMM[11]
2088 pxor @XMM[11], @XMM[6]
2089 movdqu @XMM[6], 0x30($out)
2091 movdqu 0x40($inp), @XMM[12]
2092 pxor @XMM[12], @XMM[3]
2093 movdqu @XMM[3], 0x40($out)
2096 movdqu 0x50($inp), @XMM[13]
2097 pxor @XMM[13], @XMM[7]
2098 movdqu @XMM[7], 0x50($out)
2100 movdqu 0x60($inp), @XMM[14]
2101 pxor @XMM[14], @XMM[2]
2102 movdqu @XMM[2], 0x60($out)
# Short-input CTR fallback: encrypt the counter block (kept at 0x20(%rbp))
# with the table-based AES into the scratch buffer at 0x30(%rbp), XOR one
# block of input with it, then bump the 32-bit counter in place.
2107 lea 0x20(%rbp), $arg1 # arg1 = counter block
2108 lea 0x30(%rbp), $arg2 # arg2 = scratch keystream buffer
2110 call asm_AES_encrypt
2111 movdqu ($inp), @XMM[1]
2113 mov 0x2c(%rbp), %eax # load 32-bit counter (high dword of counter block)
2115 pxor 0x30(%rbp), @XMM[1]
2116 inc %eax # increment
2117 movdqu @XMM[1], ($out)
2120 mov %eax, 0x2c(%rbp) # save 32-bit counter back where it was loaded from
# FIX: was "mov %eax, 0x2c(%rsp)" — the counter is loaded from 0x2c(%rbp)
# above, so it must be stored back to the same slot; %rsp points at the
# bit-sliced key schedule here, so the old store both dropped the
# incremented counter and corrupted the key schedule area.
2127 .Lctr_enc_bzero: # wipe key schedule [if any]
2128 movdqa %xmm0, 0x00(%rax)
2129 movdqa %xmm0, 0x10(%rax)
2130 lea 0x20(%rax), %rax
2137 $code.=<<___ if ($win64);
2138 movaps 0x40(%rbp), %xmm6
2139 movaps 0x50(%rbp), %xmm7
2140 movaps 0x60(%rbp), %xmm8
2141 movaps 0x70(%rbp), %xmm9
2142 movaps 0x80(%rbp), %xmm10
2143 movaps 0x90(%rbp), %xmm11
2144 movaps 0xa0(%rbp), %xmm12
2145 movaps 0xb0(%rbp), %xmm13
2146 movaps 0xc0(%rbp), %xmm14
2147 movaps 0xd0(%rbp), %xmm15
2148 lea 0xa0(%rax), %rax
2164 lea (%rax), %rsp # restore %rsp
2165 .cfi_def_cfa_register %rsp
2169 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2171 ######################################################################
2172 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2173 # const AES_KEY *key1, const AES_KEY *key2,
2174 # const unsigned char iv[16]);
2176 my ($twmask,$twres,$twtmp)=@XMM[13..15];
2180 .globl bsaes_xts_encrypt
2181 .type bsaes_xts_encrypt,\@abi-omnipotent
2199 lea -0x48(%rsp), %rsp
2200 .cfi_adjust_cfa_offset 0x48
2202 $code.=<<___ if ($win64);
2203 mov 0xa0(%rsp),$arg5 # pull key2
2204 mov 0xa8(%rsp),$arg6 # pull ivp
2205 lea -0xa0(%rsp), %rsp
2206 movaps %xmm6, 0x40(%rsp)
2207 movaps %xmm7, 0x50(%rsp)
2208 movaps %xmm8, 0x60(%rsp)
2209 movaps %xmm9, 0x70(%rsp)
2210 movaps %xmm10, 0x80(%rsp)
2211 movaps %xmm11, 0x90(%rsp)
2212 movaps %xmm12, 0xa0(%rsp)
2213 movaps %xmm13, 0xb0(%rsp)
2214 movaps %xmm14, 0xc0(%rsp)
2215 movaps %xmm15, 0xd0(%rsp)
2219 mov %rsp, %rbp # backup %rsp
2220 .cfi_def_cfa_register %rbp
2221 mov $arg1, $inp # backup arguments
2227 lea 0x20(%rbp), $arg2
2229 call asm_AES_encrypt # generate initial tweak
2231 mov 240($key), %eax # rounds
2232 mov $len, %rbx # backup $len
2234 mov %eax, %edx # rounds
2235 shl \$7, %rax # 128 bytes per inner round key
2236 sub \$`128-32`, %rax # size of bit-sliced key schedule
2239 mov %rsp, %rax # pass key schedule
2240 mov $key, %rcx # pass key
2241 mov %edx, %r10d # pass rounds
2242 call _bsaes_key_convert
2243 pxor %xmm6, %xmm7 # fix up last round key
2244 movdqa %xmm7, (%rax) # save last round key
2247 sub \$0x80, %rsp # place for tweak[8]
2248 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2251 movdqa .Lxts_magic(%rip), $twmask
2252 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2261 for ($i=0;$i<7;$i++) {
2263 pshufd \$0x13, $twtmp, $twres
2265 movdqa @XMM[7], @XMM[$i]
2266 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2267 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2268 pand $twmask, $twres # isolate carry and residue
2269 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2270 pxor $twres, @XMM[7]
2272 $code.=<<___ if ($i>=1);
2273 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2275 $code.=<<___ if ($i>=2);
2276 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2280 movdqu 0x60($inp), @XMM[8+6]
2281 pxor @XMM[8+5], @XMM[5]
2282 movdqu 0x70($inp), @XMM[8+7]
2283 lea 0x80($inp), $inp
2284 movdqa @XMM[7], 0x70(%rsp)
2285 pxor @XMM[8+6], @XMM[6]
2286 lea 0x80(%rsp), %rax # pass key schedule
2287 pxor @XMM[8+7], @XMM[7]
2288 mov %edx, %r10d # pass rounds
2290 call _bsaes_encrypt8
2292 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2293 pxor 0x10(%rsp), @XMM[1]
2294 movdqu @XMM[0], 0x00($out) # write output
2295 pxor 0x20(%rsp), @XMM[4]
2296 movdqu @XMM[1], 0x10($out)
2297 pxor 0x30(%rsp), @XMM[6]
2298 movdqu @XMM[4], 0x20($out)
2299 pxor 0x40(%rsp), @XMM[3]
2300 movdqu @XMM[6], 0x30($out)
2301 pxor 0x50(%rsp), @XMM[7]
2302 movdqu @XMM[3], 0x40($out)
2303 pxor 0x60(%rsp), @XMM[2]
2304 movdqu @XMM[7], 0x50($out)
2305 pxor 0x70(%rsp), @XMM[5]
2306 movdqu @XMM[2], 0x60($out)
2307 movdqu @XMM[5], 0x70($out)
2308 lea 0x80($out), $out
2310 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2312 movdqa .Lxts_magic(%rip), $twmask
2313 pcmpgtd @XMM[7], $twtmp
2314 pshufd \$0x13, $twtmp, $twres
2316 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2317 pand $twmask, $twres # isolate carry and residue
2318 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2319 pxor $twres, @XMM[7]
2328 for ($i=0;$i<7;$i++) {
2330 pshufd \$0x13, $twtmp, $twres
2332 movdqa @XMM[7], @XMM[$i]
2333 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2334 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2335 pand $twmask, $twres # isolate carry and residue
2336 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2337 pxor $twres, @XMM[7]
2339 $code.=<<___ if ($i>=1);
2340 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2341 cmp \$`0x10*$i`,$len
2344 $code.=<<___ if ($i>=2);
2345 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2349 movdqu 0x60($inp), @XMM[8+6]
2350 pxor @XMM[8+5], @XMM[5]
2351 movdqa @XMM[7], 0x70(%rsp)
2352 lea 0x70($inp), $inp
2353 pxor @XMM[8+6], @XMM[6]
2354 lea 0x80(%rsp), %rax # pass key schedule
2355 mov %edx, %r10d # pass rounds
2357 call _bsaes_encrypt8
2359 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2360 pxor 0x10(%rsp), @XMM[1]
2361 movdqu @XMM[0], 0x00($out) # write output
2362 pxor 0x20(%rsp), @XMM[4]
2363 movdqu @XMM[1], 0x10($out)
2364 pxor 0x30(%rsp), @XMM[6]
2365 movdqu @XMM[4], 0x20($out)
2366 pxor 0x40(%rsp), @XMM[3]
2367 movdqu @XMM[6], 0x30($out)
2368 pxor 0x50(%rsp), @XMM[7]
2369 movdqu @XMM[3], 0x40($out)
2370 pxor 0x60(%rsp), @XMM[2]
2371 movdqu @XMM[7], 0x50($out)
2372 movdqu @XMM[2], 0x60($out)
2373 lea 0x70($out), $out
2375 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2379 pxor @XMM[8+4], @XMM[4]
2380 lea 0x60($inp), $inp
2381 pxor @XMM[8+5], @XMM[5]
2382 lea 0x80(%rsp), %rax # pass key schedule
2383 mov %edx, %r10d # pass rounds
2385 call _bsaes_encrypt8
2387 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2388 pxor 0x10(%rsp), @XMM[1]
2389 movdqu @XMM[0], 0x00($out) # write output
2390 pxor 0x20(%rsp), @XMM[4]
2391 movdqu @XMM[1], 0x10($out)
2392 pxor 0x30(%rsp), @XMM[6]
2393 movdqu @XMM[4], 0x20($out)
2394 pxor 0x40(%rsp), @XMM[3]
2395 movdqu @XMM[6], 0x30($out)
2396 pxor 0x50(%rsp), @XMM[7]
2397 movdqu @XMM[3], 0x40($out)
2398 movdqu @XMM[7], 0x50($out)
2399 lea 0x60($out), $out
2401 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2405 pxor @XMM[8+3], @XMM[3]
2406 lea 0x50($inp), $inp
2407 pxor @XMM[8+4], @XMM[4]
2408 lea 0x80(%rsp), %rax # pass key schedule
2409 mov %edx, %r10d # pass rounds
2411 call _bsaes_encrypt8
2413 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2414 pxor 0x10(%rsp), @XMM[1]
2415 movdqu @XMM[0], 0x00($out) # write output
2416 pxor 0x20(%rsp), @XMM[4]
2417 movdqu @XMM[1], 0x10($out)
2418 pxor 0x30(%rsp), @XMM[6]
2419 movdqu @XMM[4], 0x20($out)
2420 pxor 0x40(%rsp), @XMM[3]
2421 movdqu @XMM[6], 0x30($out)
2422 movdqu @XMM[3], 0x40($out)
2423 lea 0x50($out), $out
2425 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2429 pxor @XMM[8+2], @XMM[2]
2430 lea 0x40($inp), $inp
2431 pxor @XMM[8+3], @XMM[3]
2432 lea 0x80(%rsp), %rax # pass key schedule
2433 mov %edx, %r10d # pass rounds
2435 call _bsaes_encrypt8
2437 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2438 pxor 0x10(%rsp), @XMM[1]
2439 movdqu @XMM[0], 0x00($out) # write output
2440 pxor 0x20(%rsp), @XMM[4]
2441 movdqu @XMM[1], 0x10($out)
2442 pxor 0x30(%rsp), @XMM[6]
2443 movdqu @XMM[4], 0x20($out)
2444 movdqu @XMM[6], 0x30($out)
2445 lea 0x40($out), $out
2447 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2451 pxor @XMM[8+1], @XMM[1]
2452 lea 0x30($inp), $inp
2453 pxor @XMM[8+2], @XMM[2]
2454 lea 0x80(%rsp), %rax # pass key schedule
2455 mov %edx, %r10d # pass rounds
2457 call _bsaes_encrypt8
2459 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2460 pxor 0x10(%rsp), @XMM[1]
2461 movdqu @XMM[0], 0x00($out) # write output
2462 pxor 0x20(%rsp), @XMM[4]
2463 movdqu @XMM[1], 0x10($out)
2464 movdqu @XMM[4], 0x20($out)
2465 lea 0x30($out), $out
2467 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2471 pxor @XMM[8+0], @XMM[0]
2472 lea 0x20($inp), $inp
2473 pxor @XMM[8+1], @XMM[1]
2474 lea 0x80(%rsp), %rax # pass key schedule
2475 mov %edx, %r10d # pass rounds
2477 call _bsaes_encrypt8
2479 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2480 pxor 0x10(%rsp), @XMM[1]
2481 movdqu @XMM[0], 0x00($out) # write output
2482 movdqu @XMM[1], 0x10($out)
2483 lea 0x20($out), $out
2485 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2489 pxor @XMM[0], @XMM[8]
2490 lea 0x10($inp), $inp
2491 movdqa @XMM[8], 0x20(%rbp)
2492 lea 0x20(%rbp), $arg1
2493 lea 0x20(%rbp), $arg2
2495 call asm_AES_encrypt # doesn't touch %xmm
2496 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2497 #pxor @XMM[8], @XMM[0]
2498 #lea 0x80(%rsp), %rax # pass key schedule
2499 #mov %edx, %r10d # pass rounds
2500 #call _bsaes_encrypt8
2501 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2502 movdqu @XMM[0], 0x00($out) # write output
2503 lea 0x10($out), $out
2505 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
# XTS-AES ciphertext stealing: swap trailing bytes between the partial
# final block and the last full ciphertext block, then re-encrypt that
# last full block under tweak @XMM[7].
2514 movzb -16($out), %ecx # steal a byte from the last full ciphertext block
# FIX: was "movzb -16(%rdx), %ecx" — %edx holds the round count here, not
# a data pointer; the surrounding code addresses the last full ciphertext
# block as -16($out) (see the movdqu below and the final store).
2522 movdqu -16($out), @XMM[0] # reload the byte-swapped last full block
2523 lea 0x20(%rbp), $arg1
2524 pxor @XMM[7], @XMM[0] # ^= tweak of the final block
2525 lea 0x20(%rbp), $arg2
2526 movdqa @XMM[0], 0x20(%rbp)
2528 call asm_AES_encrypt # doesn't touch %xmm
2529 pxor 0x20(%rbp), @XMM[7] # ^= tweak again after encryption
2530 movdqu @XMM[7], -16($out) # overwrite last full block with the result
2535 .Lxts_enc_bzero: # wipe key schedule [if any]
2536 movdqa %xmm0, 0x00(%rax)
2537 movdqa %xmm0, 0x10(%rax)
2538 lea 0x20(%rax), %rax
2545 $code.=<<___ if ($win64);
2546 movaps 0x40(%rbp), %xmm6
2547 movaps 0x50(%rbp), %xmm7
2548 movaps 0x60(%rbp), %xmm8
2549 movaps 0x70(%rbp), %xmm9
2550 movaps 0x80(%rbp), %xmm10
2551 movaps 0x90(%rbp), %xmm11
2552 movaps 0xa0(%rbp), %xmm12
2553 movaps 0xb0(%rbp), %xmm13
2554 movaps 0xc0(%rbp), %xmm14
2555 movaps 0xd0(%rbp), %xmm15
2556 lea 0xa0(%rax), %rax
2572 lea (%rax), %rsp # restore %rsp
2573 .cfi_def_cfa_register %rsp
2577 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt
2579 .globl bsaes_xts_decrypt
2580 .type bsaes_xts_decrypt,\@abi-omnipotent
2598 lea -0x48(%rsp), %rsp
2599 .cfi_adjust_cfa_offset 0x48
2601 $code.=<<___ if ($win64);
2602 mov 0xa0(%rsp),$arg5 # pull key2
2603 mov 0xa8(%rsp),$arg6 # pull ivp
2604 lea -0xa0(%rsp), %rsp
2605 movaps %xmm6, 0x40(%rsp)
2606 movaps %xmm7, 0x50(%rsp)
2607 movaps %xmm8, 0x60(%rsp)
2608 movaps %xmm9, 0x70(%rsp)
2609 movaps %xmm10, 0x80(%rsp)
2610 movaps %xmm11, 0x90(%rsp)
2611 movaps %xmm12, 0xa0(%rsp)
2612 movaps %xmm13, 0xb0(%rsp)
2613 movaps %xmm14, 0xc0(%rsp)
2614 movaps %xmm15, 0xd0(%rsp)
2618 mov %rsp, %rbp # backup %rsp
2619 mov $arg1, $inp # backup arguments
2625 lea 0x20(%rbp), $arg2
2627 call asm_AES_encrypt # generate initial tweak
2629 mov 240($key), %eax # rounds
2630 mov $len, %rbx # backup $len
2632 mov %eax, %edx # rounds
2633 shl \$7, %rax # 128 bytes per inner round key
2634 sub \$`128-32`, %rax # size of bit-sliced key schedule
2637 mov %rsp, %rax # pass key schedule
2638 mov $key, %rcx # pass key
2639 mov %edx, %r10d # pass rounds
2640 call _bsaes_key_convert
2641 pxor (%rsp), %xmm7 # fix up round 0 key
2642 movdqa %xmm6, (%rax) # save last round key
2643 movdqa %xmm7, (%rsp)
2645 xor %eax, %eax # if ($len%16) len-=16;
2652 sub \$0x80, %rsp # place for tweak[8]
2653 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2656 movdqa .Lxts_magic(%rip), $twmask
2657 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2666 for ($i=0;$i<7;$i++) {
2668 pshufd \$0x13, $twtmp, $twres
2670 movdqa @XMM[7], @XMM[$i]
2671 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2672 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2673 pand $twmask, $twres # isolate carry and residue
2674 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2675 pxor $twres, @XMM[7]
2677 $code.=<<___ if ($i>=1);
2678 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2680 $code.=<<___ if ($i>=2);
2681 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2685 movdqu 0x60($inp), @XMM[8+6]
2686 pxor @XMM[8+5], @XMM[5]
2687 movdqu 0x70($inp), @XMM[8+7]
2688 lea 0x80($inp), $inp
2689 movdqa @XMM[7], 0x70(%rsp)
2690 pxor @XMM[8+6], @XMM[6]
2691 lea 0x80(%rsp), %rax # pass key schedule
2692 pxor @XMM[8+7], @XMM[7]
2693 mov %edx, %r10d # pass rounds
2695 call _bsaes_decrypt8
2697 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2698 pxor 0x10(%rsp), @XMM[1]
2699 movdqu @XMM[0], 0x00($out) # write output
2700 pxor 0x20(%rsp), @XMM[6]
2701 movdqu @XMM[1], 0x10($out)
2702 pxor 0x30(%rsp), @XMM[4]
2703 movdqu @XMM[6], 0x20($out)
2704 pxor 0x40(%rsp), @XMM[2]
2705 movdqu @XMM[4], 0x30($out)
2706 pxor 0x50(%rsp), @XMM[7]
2707 movdqu @XMM[2], 0x40($out)
2708 pxor 0x60(%rsp), @XMM[3]
2709 movdqu @XMM[7], 0x50($out)
2710 pxor 0x70(%rsp), @XMM[5]
2711 movdqu @XMM[3], 0x60($out)
2712 movdqu @XMM[5], 0x70($out)
2713 lea 0x80($out), $out
2715 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2717 movdqa .Lxts_magic(%rip), $twmask
2718 pcmpgtd @XMM[7], $twtmp
2719 pshufd \$0x13, $twtmp, $twres
2721 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2722 pand $twmask, $twres # isolate carry and residue
2723 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2724 pxor $twres, @XMM[7]
2733 for ($i=0;$i<7;$i++) {
2735 pshufd \$0x13, $twtmp, $twres
2737 movdqa @XMM[7], @XMM[$i]
2738 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2739 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2740 pand $twmask, $twres # isolate carry and residue
2741 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2742 pxor $twres, @XMM[7]
2744 $code.=<<___ if ($i>=1);
2745 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2746 cmp \$`0x10*$i`,$len
2749 $code.=<<___ if ($i>=2);
2750 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2754 movdqu 0x60($inp), @XMM[8+6]
2755 pxor @XMM[8+5], @XMM[5]
2756 movdqa @XMM[7], 0x70(%rsp)
2757 lea 0x70($inp), $inp
2758 pxor @XMM[8+6], @XMM[6]
2759 lea 0x80(%rsp), %rax # pass key schedule
2760 mov %edx, %r10d # pass rounds
2762 call _bsaes_decrypt8
2764 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2765 pxor 0x10(%rsp), @XMM[1]
2766 movdqu @XMM[0], 0x00($out) # write output
2767 pxor 0x20(%rsp), @XMM[6]
2768 movdqu @XMM[1], 0x10($out)
2769 pxor 0x30(%rsp), @XMM[4]
2770 movdqu @XMM[6], 0x20($out)
2771 pxor 0x40(%rsp), @XMM[2]
2772 movdqu @XMM[4], 0x30($out)
2773 pxor 0x50(%rsp), @XMM[7]
2774 movdqu @XMM[2], 0x40($out)
2775 pxor 0x60(%rsp), @XMM[3]
2776 movdqu @XMM[7], 0x50($out)
2777 movdqu @XMM[3], 0x60($out)
2778 lea 0x70($out), $out
2780 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2784 pxor @XMM[8+4], @XMM[4]
2785 lea 0x60($inp), $inp
2786 pxor @XMM[8+5], @XMM[5]
2787 lea 0x80(%rsp), %rax # pass key schedule
2788 mov %edx, %r10d # pass rounds
2790 call _bsaes_decrypt8
2792 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2793 pxor 0x10(%rsp), @XMM[1]
2794 movdqu @XMM[0], 0x00($out) # write output
2795 pxor 0x20(%rsp), @XMM[6]
2796 movdqu @XMM[1], 0x10($out)
2797 pxor 0x30(%rsp), @XMM[4]
2798 movdqu @XMM[6], 0x20($out)
2799 pxor 0x40(%rsp), @XMM[2]
2800 movdqu @XMM[4], 0x30($out)
2801 pxor 0x50(%rsp), @XMM[7]
2802 movdqu @XMM[2], 0x40($out)
2803 movdqu @XMM[7], 0x50($out)
2804 lea 0x60($out), $out
2806 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2810 pxor @XMM[8+3], @XMM[3]
2811 lea 0x50($inp), $inp
2812 pxor @XMM[8+4], @XMM[4]
2813 lea 0x80(%rsp), %rax # pass key schedule
2814 mov %edx, %r10d # pass rounds
2816 call _bsaes_decrypt8
2818 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2819 pxor 0x10(%rsp), @XMM[1]
2820 movdqu @XMM[0], 0x00($out) # write output
2821 pxor 0x20(%rsp), @XMM[6]
2822 movdqu @XMM[1], 0x10($out)
2823 pxor 0x30(%rsp), @XMM[4]
2824 movdqu @XMM[6], 0x20($out)
2825 pxor 0x40(%rsp), @XMM[2]
2826 movdqu @XMM[4], 0x30($out)
2827 movdqu @XMM[2], 0x40($out)
2828 lea 0x50($out), $out
2830 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2834 pxor @XMM[8+2], @XMM[2]
2835 lea 0x40($inp), $inp
2836 pxor @XMM[8+3], @XMM[3]
2837 lea 0x80(%rsp), %rax # pass key schedule
2838 mov %edx, %r10d # pass rounds
2840 call _bsaes_decrypt8
2842 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2843 pxor 0x10(%rsp), @XMM[1]
2844 movdqu @XMM[0], 0x00($out) # write output
2845 pxor 0x20(%rsp), @XMM[6]
2846 movdqu @XMM[1], 0x10($out)
2847 pxor 0x30(%rsp), @XMM[4]
2848 movdqu @XMM[6], 0x20($out)
2849 movdqu @XMM[4], 0x30($out)
2850 lea 0x40($out), $out
2852 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2856 pxor @XMM[8+1], @XMM[1]
2857 lea 0x30($inp), $inp
2858 pxor @XMM[8+2], @XMM[2]
2859 lea 0x80(%rsp), %rax # pass key schedule
2860 mov %edx, %r10d # pass rounds
2862 call _bsaes_decrypt8
2864 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2865 pxor 0x10(%rsp), @XMM[1]
2866 movdqu @XMM[0], 0x00($out) # write output
2867 pxor 0x20(%rsp), @XMM[6]
2868 movdqu @XMM[1], 0x10($out)
2869 movdqu @XMM[6], 0x20($out)
2870 lea 0x30($out), $out
2872 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2876 pxor @XMM[8+0], @XMM[0]
2877 lea 0x20($inp), $inp
2878 pxor @XMM[8+1], @XMM[1]
2879 lea 0x80(%rsp), %rax # pass key schedule
2880 mov %edx, %r10d # pass rounds
2882 call _bsaes_decrypt8
2884 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2885 pxor 0x10(%rsp), @XMM[1]
2886 movdqu @XMM[0], 0x00($out) # write output
2887 movdqu @XMM[1], 0x10($out)
2888 lea 0x20($out), $out
2890 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2894 pxor @XMM[0], @XMM[8]
2895 lea 0x10($inp), $inp
2896 movdqa @XMM[8], 0x20(%rbp)
2897 lea 0x20(%rbp), $arg1
2898 lea 0x20(%rbp), $arg2
2900 call asm_AES_decrypt # doesn't touch %xmm
2901 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2902 #pxor @XMM[8], @XMM[0]
2903 #lea 0x80(%rsp), %rax # pass key schedule
2904 #mov %edx, %r10d # pass rounds
2905 #call _bsaes_decrypt8
2906 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2907 movdqu @XMM[0], 0x00($out) # write output
2908 lea 0x10($out), $out
2910 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2917 movdqa .Lxts_magic(%rip), $twmask
2918 pcmpgtd @XMM[7], $twtmp
2919 pshufd \$0x13, $twtmp, $twres
2920 movdqa @XMM[7], @XMM[6]
2921 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2922 pand $twmask, $twres # isolate carry and residue
2923 movdqu ($inp), @XMM[0]
2924 pxor $twres, @XMM[7]
2926 lea 0x20(%rbp), $arg1
2927 pxor @XMM[7], @XMM[0]
2928 lea 0x20(%rbp), $arg2
2929 movdqa @XMM[0], 0x20(%rbp)
2931 call asm_AES_decrypt # doesn't touch %xmm
2932 pxor 0x20(%rbp), @XMM[7]
2934 movdqu @XMM[7], ($out)
2937 movzb 16($inp), %eax
2946 movdqu ($out), @XMM[0]
2947 lea 0x20(%rbp), $arg1
2948 pxor @XMM[6], @XMM[0]
2949 lea 0x20(%rbp), $arg2
2950 movdqa @XMM[0], 0x20(%rbp)
2952 call asm_AES_decrypt # doesn't touch %xmm
2953 pxor 0x20(%rbp), @XMM[6]
2954 movdqu @XMM[6], ($out)
2959 .Lxts_dec_bzero: # wipe key schedule [if any]
2960 movdqa %xmm0, 0x00(%rax)
2961 movdqa %xmm0, 0x10(%rax)
2962 lea 0x20(%rax), %rax
2969 $code.=<<___ if ($win64);
2970 movaps 0x40(%rbp), %xmm6
2971 movaps 0x50(%rbp), %xmm7
2972 movaps 0x60(%rbp), %xmm8
2973 movaps 0x70(%rbp), %xmm9
2974 movaps 0x80(%rbp), %xmm10
2975 movaps 0x90(%rbp), %xmm11
2976 movaps 0xa0(%rbp), %xmm12
2977 movaps 0xb0(%rbp), %xmm13
2978 movaps 0xc0(%rbp), %xmm14
2979 movaps 0xd0(%rbp), %xmm15
2980 lea 0xa0(%rax), %rax
2996 lea (%rax), %rsp # restore %rsp
2997 .cfi_def_cfa_register %rsp
3001 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt
# Read-only constant table shared by the bit-sliced AES routines.
# NOTE(review): several label lines were removed from this excerpt; label
# attributions marked "presumably" below are inferred from the load offsets
# used elsewhere in the file and should be confirmed against the full source.
3005 .type _bsaes_const,\@object
3008 .LM0ISR: # InvShiftRows constants
3009 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
3011 .quad 0x01040b0e0205080f, 0x0306090c00070a0d # presumably .LISRM0 (label not shown here)
3013 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09 # presumably .LISR (label not shown here)
3014 .LBS0: # bit-slice constants
3015 .quad 0x5555555555555555, 0x5555555555555555 # odd/even bit mask (slice level 0)
3017 .quad 0x3333333333333333, 0x3333333333333333 # 2-bit-group mask (slice level 1; label not shown)
3019 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f # nibble mask (slice level 2; label not shown)
3020 .LSR: # shiftrows constants
3021 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
3023 .quad 0x0304090e00050a0f, 0x01060b0c0207080d # further ShiftRows permutation (label not shown)
3025 .quad 0x0a0e02060f03070b, 0x0004080c05090d01 # further ShiftRows permutation (label not shown)
3026 .LSWPUP: # byte-swap upper dword
3027 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
3029 .quad 0x0a0d02060c03070b, 0x0004080f05090e01 # presumably .LSWPUPM0SR, loaded at -0x10(.LADD1) in the CTR code
3030 .LADD1: # counter increment constants
3031 .quad 0x0000000000000000, 0x0000000100000000 # +1 in the big-endian counter dword
3033 .quad 0x0000000000000000, 0x0000000200000000 # +2 (.LADD2; label not shown)
3035 .quad 0x0000000000000000, 0x0000000300000000 # +3
3037 .quad 0x0000000000000000, 0x0000000400000000 # +4
3039 .quad 0x0000000000000000, 0x0000000500000000 # +5
3041 .quad 0x0000000000000000, 0x0000000600000000 # +6
3043 .quad 0x0000000000000000, 0x0000000700000000 # +7
3045 .quad 0x0000000000000000, 0x0000000800000000 # +8 (.LADD8, whole-loop stride)
3049 .quad 0x0101010101010101, 0x0101010101010101 # bit-0 byte mask
3050 .quad 0x0202020202020202, 0x0202020202020202 # bit-1 byte mask
3051 .quad 0x0404040404040404, 0x0404040404040404 # bit-2 byte mask
3052 .quad 0x0808080808080808, 0x0808080808080808 # bit-3 byte mask (presumably .Lmasks for key conversion)
3054 .quad 0x02060a0e03070b0f, 0x0004080c0105090d # presumably .LM0 (label not shown here)
3056 .quad 0x6363636363636363, 0x6363636363636363 # repeated 0x63, the AES S-box affine constant (label not shown)
3057 .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
3059 .size _bsaes_const,.-_bsaes_const
3062 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
3063 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
3071 .extern __imp_RtlVirtualUnwind
# Win64 SEH handler: classify context->Rip against the [prologue, epilogue)
# and tail labels recorded in HandlerData[0..2]; when inside the function
# body, restore the nonvolatile registers and %xmm6-%xmm15 saved in the
# frame, then continue unwinding via RtlVirtualUnwind.
3072 .type se_handler,\@abi-omnipotent
3086 mov 120($context),%rax # pull context->Rax
3087 mov 248($context),%rbx # pull context->Rip
3089 mov 8($disp),%rsi # disp->ImageBase
3090 mov 56($disp),%r11 # disp->HandlerData
3092 mov 0(%r11),%r10d # HandlerData[0]
3093 lea (%rsi,%r10),%r10 # prologue label
3094 cmp %r10,%rbx # context->Rip<=prologue label
3097 mov 4(%r11),%r10d # HandlerData[1]
3098 lea (%rsi,%r10),%r10 # epilogue label
3099 cmp %r10,%rbx # context->Rip>=epilogue label
3102 mov 8(%r11),%r10d # HandlerData[2]
3103 lea (%rsi,%r10),%r10 # tail label (HandlerData[2], distinct from the epilogue label above)
3104 cmp %r10,%rbx # context->Rip>=tail label
# In-body: copy the ten saved %xmm registers back into the CONTEXT record
# and compute the caller's stack pointer from the frame base.
3107 mov 160($context),%rax # pull context->Rbp
3109 lea 0x40(%rax),%rsi # %xmm save area
3110 lea 512($context),%rdi # &context.Xmm6
3111 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
3112 .long 0xa548f3fc # cld; rep movsq
3113 lea 0xa0+0x78(%rax),%rax # adjust stack pointer
3122 mov %rbx,144($context) # restore context->Rbx
3123 mov %rbp,160($context) # restore context->Rbp
3124 mov %r12,216($context) # restore context->R12
3125 mov %r13,224($context) # restore context->R13
3126 mov %r14,232($context) # restore context->R14
3127 mov %r15,240($context) # restore context->R15
3130 mov %rax,152($context) # restore context->Rsp
# Hand off to RtlVirtualUnwind to continue the unwind with the updated
# context; arguments 5-8 are passed on the stack per the Win64 ABI.
3132 mov 40($disp),%rdi # disp->ContextRecord
3133 mov $context,%rsi # context
3134 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
3135 .long 0xa548f3fc # cld; rep movsq
3138 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3139 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3140 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3141 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3142 mov 40(%rsi),%r10 # disp->ContextRecord
3143 lea 56(%rsi),%r11 # &disp->HandlerData
3144 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3145 mov %r10,32(%rsp) # arg5
3146 mov %r11,40(%rsp) # arg6
3147 mov %r12,48(%rsp) # arg7
3148 mov %rcx,56(%rsp) # arg8, (NULL)
3149 call *__imp_RtlVirtualUnwind(%rip)
3151 mov \$1,%eax # ExceptionContinueSearch
3163 .size se_handler,.-se_handler
3168 $code.=<<___ if ($ecb);
3169 .rva .Lecb_enc_prologue
3170 .rva .Lecb_enc_epilogue
3173 .rva .Lecb_dec_prologue
3174 .rva .Lecb_dec_epilogue
3178 .rva .Lcbc_dec_prologue
3179 .rva .Lcbc_dec_epilogue
3182 .rva .Lctr_enc_prologue
3183 .rva .Lctr_enc_epilogue
3186 .rva .Lxts_enc_prologue
3187 .rva .Lxts_enc_epilogue
3190 .rva .Lxts_dec_prologue
3191 .rva .Lxts_dec_epilogue
3197 $code.=<<___ if ($ecb);
3201 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3207 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3215 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3221 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3227 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3233 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
3239 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
3243 close STDOUT or die "error closing STDOUT: $!";