3 ###################################################################
4 ### AES-128 [originally in CTR mode] ###
5 ### bitsliced implementation for Intel Core 2 processors ###
6 ### requires support of SSE extensions up to SSSE3 ###
7 ### Author: Emilia Käsper and Peter Schwabe ###
8 ### Date: 2009-03-19 ###
11 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12 ### further information. ###
13 ###################################################################
17 # Started as transliteration to "perlasm" the original code has
18 # undergone following changes:
20 # - code was made position-independent;
21 # - rounds were folded into a loop resulting in >5x size reduction
22 # from 12.5KB to 2.2KB;
# - above was possible thanks to mixcolumns() modification that
# allowed to feed its output back to aesenc[last], this was
# achieved at cost of two additional inter-register moves;
26 # - some instruction reordering and interleaving;
27 # - this module doesn't implement key setup subroutine, instead it
28 # relies on conversion of "conventional" key schedule as returned
29 # by AES_set_encrypt_key (see discussion below);
30 # - first and last round keys are treated differently, which allowed
31 # to skip one shiftrows(), reduce bit-sliced key schedule and
32 # speed-up conversion by 22%;
33 # - support for 192- and 256-bit keys was added;
35 # Resulting performance in CPU cycles spent to encrypt one byte out
36 # of 4096-byte buffer with 128-bit key is:
38 # Emilia's this(*) difference
40 # Core 2 9.30 8.69 +7%
41 # Nehalem(**) 7.63 6.88 +11%
44 # (*) Comparison is not completely fair, because "this" is ECB,
45 # i.e. no extra processing such as counter values calculation
46 # and xor-ing input as in Emilia's CTR implementation is
# performed. However, the CTR calculations account for no more
# than 1% of total time, so comparison is *rather* fair.
50 # (**) Results were collected on Westmere, which is considered to
51 # be equivalent to Nehalem for this code.
53 # As for key schedule conversion subroutine. Interface to OpenSSL
54 # relies on per-invocation on-the-fly conversion. This naturally
55 # has impact on performance, especially for short inputs. Conversion
56 # time in CPU cycles and its ratio to CPU cycles spent in 8x block
59 # conversion conversion/8x block
64 # The ratio values mean that 128-byte blocks will be processed
65 # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
66 # etc. Then keep in mind that input sizes not divisible by 128 are
67 # *effectively* slower, especially shortest ones, e.g. consecutive
68 # 144-byte blocks are processed 44% slower than one would expect,
69 # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
70 # it's still faster than ["hyper-threading-safe" code path in]
71 # aes-x86_64.pl on all lengths above 64 bytes...
75 # Add decryption procedure. Performance in CPU cycles spent to decrypt
76 # one byte out of 4096-byte buffer with 128-bit key is:
84 # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
85 # suboptimal, but XTS is meant to be used with larger blocks...
# If the "flavour" argument looks like a filename (contains a dot), it is
# really the output file; shift it over and clear the flavour.
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Detect Win64 targets from either the assembler flavour or the output
# file extension.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl perlasm translator: first next to this
# script, then in the shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe the generated code through the translator into the requested output.
open OUT,"| \"$^X\" $xlate $flavour $output";
# Argument registers (SysV x86_64 calling convention).
# NOTE(review): five names but only four values — $ivp starts out undef
# here; confirm it is assigned before any use.
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...

# Scratch registers used while emitting the encrypt/decrypt bodies.
my ($key,$rounds,$const)=("%rax","%r10d","%r11");
111 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
112 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
117 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
118 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
122 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
123 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
145 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
146 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
166 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
167 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
171 &InvInBasisChange (@b);
172 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
173 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
176 sub InvInBasisChange { # OutBasisChange in reverse
177 my @b=@_[5,1,2,6,3,7,0,4];
195 sub InvOutBasisChange { # InBasisChange in reverse
196 my @b=@_[2,5,7,3,6,1,0,4];
217 #;*************************************************************
218 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
219 #;*************************************************************
220 my ($x0,$x1,$y0,$y1,$t0)=@_;
233 sub Mul_GF4_N { # not used, see next subroutine
234 # multiply and scale by N
235 my ($x0,$x1,$y0,$y1,$t0)=@_;
249 # interleaved Mul_GF4_N and Mul_GF4
250 my ($x0,$x1,$y0,$y1,$t0,
251 $x2,$x3,$y2,$y3,$t1)=@_;
279 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
286 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
287 @x[2], @x[3], @y[2], @y[3], @t[2]);
299 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
300 @x[6], @x[7], @y[2], @y[3], @t[2]);
305 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
314 #;********************************************************************
315 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
316 #;********************************************************************
320 # direct optimizations from hardware
375 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
377 # new smaller inversion
411 # output in s3, s2, s1, t1
413 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
415 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
416 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
418 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
421 # AES linear components
427 pxor 0x00($key),@x[0]
428 pxor 0x10($key),@x[1]
429 pxor 0x20($key),@x[2]
430 pxor 0x30($key),@x[3]
433 pxor 0x40($key),@x[4]
434 pxor 0x50($key),@x[5]
437 pxor 0x60($key),@x[6]
438 pxor 0x70($key),@x[7]
448 # modified to emit output in order suitable for feeding back to aesenc[last]
451 my $inv=@_[16]; # optional
453 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
454 pshufd \$0x93, @x[1], @t[1]
455 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
456 pshufd \$0x93, @x[2], @t[2]
458 pshufd \$0x93, @x[3], @t[3]
460 pshufd \$0x93, @x[4], @t[4]
462 pshufd \$0x93, @x[5], @t[5]
464 pshufd \$0x93, @x[6], @t[6]
466 pshufd \$0x93, @x[7], @t[7]
473 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
475 pshufd \$0x4E, @x[1], @x[1]
481 pshufd \$0x4E, @x[4], @t[0]
483 pshufd \$0x4E, @x[5], @t[1]
485 pshufd \$0x4E, @x[3], @x[4]
487 pshufd \$0x4E, @x[7], @x[5]
489 pshufd \$0x4E, @x[6], @x[3]
491 pshufd \$0x4E, @x[2], @x[6]
494 $code.=<<___ if (!$inv);
502 $code.=<<___ if ($inv);
515 sub InvMixColumns_orig {
520 # multiplication by 0x0e
521 pshufd \$0x93, @x[7], @t[7]
523 pxor @x[5], @x[7] # 7 5
524 pxor @x[5], @x[2] # 2 5
525 pshufd \$0x93, @x[0], @t[0]
527 pxor @x[0], @x[5] # 5 0 [1]
528 pxor @x[1], @x[0] # 0 1
529 pshufd \$0x93, @x[1], @t[1]
530 pxor @x[2], @x[1] # 1 25
531 pxor @x[6], @x[0] # 01 6 [2]
532 pxor @x[3], @x[1] # 125 3 [4]
533 pshufd \$0x93, @x[3], @t[3]
534 pxor @x[0], @x[2] # 25 016 [3]
535 pxor @x[7], @x[3] # 3 75
536 pxor @x[6], @x[7] # 75 6 [0]
537 pshufd \$0x93, @x[6], @t[6]
539 pxor @x[4], @x[6] # 6 4
540 pxor @x[3], @x[4] # 4 375 [6]
541 pxor @x[7], @x[3] # 375 756=36
542 pxor @t[5], @x[6] # 64 5 [7]
543 pxor @t[2], @x[3] # 36 2
544 pxor @t[4], @x[3] # 362 4 [5]
545 pshufd \$0x93, @t[5], @t[5]
547 my @y = @x[7,5,0,2,1,3,4,6];
549 # multiplication by 0x0b
553 pshufd \$0x93, @t[2], @t[2]
557 pshufd \$0x93, @t[4], @t[4]
558 pxor @t[6], @t[7] # clobber t[7]
562 pshufd \$0x93, @t[0], @t[0]
566 pshufd \$0x93, @t[1], @t[1]
570 pshufd \$0x93, @t[2], @t[2]
574 pshufd \$0x93, @t[3], @t[3]
580 pxor @t[5], @t[7] # clobber t[7] even more
583 pshufd \$0x93, @t[4], @t[4]
588 pshufd \$0x93, @t[5], @t[5]
589 pxor @t[6], @t[7] # restore t[7]
591 # multiplication by 0x0d
594 pshufd \$0x93, @t[6], @t[6]
598 pshufd \$0x93, @t[7], @t[7]
607 pshufd \$0x93, @t[0], @t[0]
611 pshufd \$0x93, @t[1], @t[1]
616 pshufd \$0x93, @t[2], @t[2]
618 pxor @t[3], @t[6] # clobber t[6]
625 pshufd \$0x93, @t[4], @t[4]
628 pxor @t[3], @t[6] # restore t[6]
630 pshufd \$0x93, @t[5], @t[5]
631 pshufd \$0x93, @t[6], @t[6]
632 pshufd \$0x93, @t[7], @t[7]
633 pshufd \$0x93, @t[3], @t[3]
635 # multiplication by 0x09
637 pxor @y[1], @t[1] # t[1]=y[1]
638 pxor @t[5], @t[0] # clobber t[0]
641 pxor @y[0], @t[0] # t[0]=y[0]
643 pxor @t[7], @t[6] # clobber t[6]
646 pxor @y[4], @t[4] # t[4]=y[4]
648 pxor @y[3], @t[3] # t[3]=y[3]
650 pxor @y[2], @t[2] # t[2]=y[2]
652 pxor @y[5], @t[5] # t[5]=y[5]
655 pxor @y[6], @t[6] # t[6]=y[6]
656 pxor @y[7], @t[7] # t[7]=y[7]
673 # Thanks to Jussi Kivilinna for providing pointer to
675 # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
676 # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
677 # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
678 # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
681 # multiplication by 0x05-0x00-0x04-0x00
682 pshufd \$0x4E, @x[0], @t[0]
683 pshufd \$0x4E, @x[6], @t[6]
685 pshufd \$0x4E, @x[7], @t[7]
687 pshufd \$0x4E, @x[1], @t[1]
689 pshufd \$0x4E, @x[2], @t[2]
691 pshufd \$0x4E, @x[3], @t[3]
695 pshufd \$0x4E, @x[4], @t[4]
699 pshufd \$0x4E, @x[5], @t[5]
714 &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
717 sub aesenc { # not used
721 movdqa 0x30($const),@t[0] # .LSR
723 &ShiftRows (@b,@t[0]);
725 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
728 sub aesenclast { # not used
732 movdqa 0x40($const),@t[0] # .LSRM0
734 &ShiftRows (@b,@t[0]);
737 pxor 0x00($key),@b[0]
738 pxor 0x10($key),@b[1]
739 pxor 0x20($key),@b[4]
740 pxor 0x30($key),@b[6]
741 pxor 0x40($key),@b[3]
742 pxor 0x50($key),@b[7]
743 pxor 0x60($key),@b[2]
744 pxor 0x70($key),@b[5]
749 my ($a,$b,$n,$mask,$t)=@_;
761 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
781 my @x=reverse(@_[0..7]);
782 my ($t0,$t1,$t2,$t3)=@_[8..11];
784 movdqa 0x00($const),$t0 # .LBS0
785 movdqa 0x10($const),$t1 # .LBS1
787 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
788 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
790 movdqa 0x20($const),$t0 # .LBS2
792 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
793 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
795 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
796 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
802 .extern asm_AES_encrypt
803 .extern asm_AES_decrypt
805 .type _bsaes_encrypt8,\@abi-omnipotent
808 lea .LBS0(%rip), $const # constants table
810 movdqa ($key), @XMM[9] # round 0 key
812 movdqa 0x50($const), @XMM[8] # .LM0SR
813 pxor @XMM[9], @XMM[0] # xor with round0 key
814 pxor @XMM[9], @XMM[1]
815 pxor @XMM[9], @XMM[2]
816 pxor @XMM[9], @XMM[3]
817 pshufb @XMM[8], @XMM[0]
818 pshufb @XMM[8], @XMM[1]
819 pxor @XMM[9], @XMM[4]
820 pxor @XMM[9], @XMM[5]
821 pshufb @XMM[8], @XMM[2]
822 pshufb @XMM[8], @XMM[3]
823 pxor @XMM[9], @XMM[6]
824 pxor @XMM[9], @XMM[7]
825 pshufb @XMM[8], @XMM[4]
826 pshufb @XMM[8], @XMM[5]
827 pshufb @XMM[8], @XMM[6]
828 pshufb @XMM[8], @XMM[7]
829 _bsaes_encrypt8_bitslice:
831 &bitslice (@XMM[0..7, 8..11]);
838 &ShiftRows (@XMM[0..7, 8]);
839 $code.=".Lenc_sbox:\n";
840 &Sbox (@XMM[0..7, 8..15]);
845 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
847 movdqa 0x30($const), @XMM[8] # .LSR
849 movdqa 0x40($const), @XMM[8] # .LSRM0
854 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
855 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
857 movdqa ($key), @XMM[8] # last round key
858 pxor @XMM[8], @XMM[4]
859 pxor @XMM[8], @XMM[6]
860 pxor @XMM[8], @XMM[3]
861 pxor @XMM[8], @XMM[7]
862 pxor @XMM[8], @XMM[2]
863 pxor @XMM[8], @XMM[5]
864 pxor @XMM[8], @XMM[0]
865 pxor @XMM[8], @XMM[1]
867 .size _bsaes_encrypt8,.-_bsaes_encrypt8
869 .type _bsaes_decrypt8,\@abi-omnipotent
872 lea .LBS0(%rip), $const # constants table
874 movdqa ($key), @XMM[9] # round 0 key
876 movdqa -0x30($const), @XMM[8] # .LM0ISR
877 pxor @XMM[9], @XMM[0] # xor with round0 key
878 pxor @XMM[9], @XMM[1]
879 pxor @XMM[9], @XMM[2]
880 pxor @XMM[9], @XMM[3]
881 pshufb @XMM[8], @XMM[0]
882 pshufb @XMM[8], @XMM[1]
883 pxor @XMM[9], @XMM[4]
884 pxor @XMM[9], @XMM[5]
885 pshufb @XMM[8], @XMM[2]
886 pshufb @XMM[8], @XMM[3]
887 pxor @XMM[9], @XMM[6]
888 pxor @XMM[9], @XMM[7]
889 pshufb @XMM[8], @XMM[4]
890 pshufb @XMM[8], @XMM[5]
891 pshufb @XMM[8], @XMM[6]
892 pshufb @XMM[8], @XMM[7]
894 &bitslice (@XMM[0..7, 8..11]);
901 &ShiftRows (@XMM[0..7, 8]);
902 $code.=".Ldec_sbox:\n";
903 &InvSbox (@XMM[0..7, 8..15]);
908 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
910 movdqa -0x10($const), @XMM[8] # .LISR
912 movdqa -0x20($const), @XMM[8] # .LISRM0
917 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
919 movdqa ($key), @XMM[8] # last round key
920 pxor @XMM[8], @XMM[6]
921 pxor @XMM[8], @XMM[4]
922 pxor @XMM[8], @XMM[2]
923 pxor @XMM[8], @XMM[7]
924 pxor @XMM[8], @XMM[3]
925 pxor @XMM[8], @XMM[5]
926 pxor @XMM[8], @XMM[0]
927 pxor @XMM[8], @XMM[1]
929 .size _bsaes_decrypt8,.-_bsaes_decrypt8
933 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
936 my @x=reverse(@_[0..7]);
937 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
939 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
941 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
945 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
947 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
949 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
955 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
956 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
960 .type _bsaes_key_convert,\@abi-omnipotent
963 lea .Lmasks(%rip), $const
964 movdqu ($inp), %xmm7 # load round 0 key
966 movdqa 0x00($const), %xmm0 # 0x01...
967 movdqa 0x10($const), %xmm1 # 0x02...
968 movdqa 0x20($const), %xmm2 # 0x04...
969 movdqa 0x30($const), %xmm3 # 0x08...
970 movdqa 0x40($const), %xmm4 # .LM0
971 pcmpeqd %xmm5, %xmm5 # .LNOT
973 movdqu ($inp), %xmm6 # load round 1 key
974 movdqa %xmm7, ($out) # save round 0 key
980 pshufb %xmm4, %xmm6 # .LM0
989 psllq \$4, %xmm0 # 0x10...
992 psllq \$4, %xmm1 # 0x20...
997 pcmpeqb %xmm2, %xmm10
998 psllq \$4, %xmm2 # 0x40...
1000 pcmpeqb %xmm3, %xmm11
1001 psllq \$4, %xmm3 # 0x80...
1003 movdqa %xmm2, %xmm14
1004 movdqa %xmm3, %xmm15
1005 pxor %xmm5, %xmm8 # "pnot"
1010 movdqa %xmm8, 0x00($out) # write bit-sliced round key
1011 pcmpeqb %xmm0, %xmm12
1012 psrlq \$4, %xmm0 # 0x01...
1013 movdqa %xmm9, 0x10($out)
1014 pcmpeqb %xmm1, %xmm13
1015 psrlq \$4, %xmm1 # 0x02...
1016 lea 0x10($inp), $inp
1020 movdqa %xmm10, 0x20($out)
1021 pcmpeqb %xmm2, %xmm14
1022 psrlq \$4, %xmm2 # 0x04...
1023 movdqa %xmm11, 0x30($out)
1024 pcmpeqb %xmm3, %xmm15
1025 psrlq \$4, %xmm3 # 0x08...
1026 movdqu ($inp), %xmm6 # load next round key
1028 pxor %xmm5, %xmm13 # "pnot"
1030 movdqa %xmm12, 0x40($out)
1031 movdqa %xmm13, 0x50($out)
1032 movdqa %xmm14, 0x60($out)
1033 movdqa %xmm15, 0x70($out)
1038 movdqa 0x50($const), %xmm7 # .L63
1039 #movdqa %xmm6, ($out) # don't save last round key
1041 .size _bsaes_key_convert,.-_bsaes_key_convert
1045 if (0 && !$win64) { # following four functions are unsupported interface
1046 # used for benchmarking...
1048 .globl bsaes_enc_key_convert
1049 .type bsaes_enc_key_convert,\@function,2
1051 bsaes_enc_key_convert:
1052 mov 240($inp),%r10d # pass rounds
1053 mov $inp,%rcx # pass key
1054 mov $out,%rax # pass key schedule
1055 call _bsaes_key_convert
1056 pxor %xmm6,%xmm7 # fix up last round key
1057 movdqa %xmm7,(%rax) # save last round key
1059 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
1061 .globl bsaes_encrypt_128
1062 .type bsaes_encrypt_128,\@function,4
1066 movdqu 0x00($inp), @XMM[0] # load input
1067 movdqu 0x10($inp), @XMM[1]
1068 movdqu 0x20($inp), @XMM[2]
1069 movdqu 0x30($inp), @XMM[3]
1070 movdqu 0x40($inp), @XMM[4]
1071 movdqu 0x50($inp), @XMM[5]
1072 movdqu 0x60($inp), @XMM[6]
1073 movdqu 0x70($inp), @XMM[7]
1074 mov $key, %rax # pass the $key
1075 lea 0x80($inp), $inp
1078 call _bsaes_encrypt8
1080 movdqu @XMM[0], 0x00($out) # write output
1081 movdqu @XMM[1], 0x10($out)
1082 movdqu @XMM[4], 0x20($out)
1083 movdqu @XMM[6], 0x30($out)
1084 movdqu @XMM[3], 0x40($out)
1085 movdqu @XMM[7], 0x50($out)
1086 movdqu @XMM[2], 0x60($out)
1087 movdqu @XMM[5], 0x70($out)
1088 lea 0x80($out), $out
1092 .size bsaes_encrypt_128,.-bsaes_encrypt_128
1094 .globl bsaes_dec_key_convert
1095 .type bsaes_dec_key_convert,\@function,2
1097 bsaes_dec_key_convert:
1098 mov 240($inp),%r10d # pass rounds
1099 mov $inp,%rcx # pass key
1100 mov $out,%rax # pass key schedule
1101 call _bsaes_key_convert
1102 pxor ($out),%xmm7 # fix up round 0 key
1103 movdqa %xmm6,(%rax) # save last round key
1106 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1108 .globl bsaes_decrypt_128
1109 .type bsaes_decrypt_128,\@function,4
1113 movdqu 0x00($inp), @XMM[0] # load input
1114 movdqu 0x10($inp), @XMM[1]
1115 movdqu 0x20($inp), @XMM[2]
1116 movdqu 0x30($inp), @XMM[3]
1117 movdqu 0x40($inp), @XMM[4]
1118 movdqu 0x50($inp), @XMM[5]
1119 movdqu 0x60($inp), @XMM[6]
1120 movdqu 0x70($inp), @XMM[7]
1121 mov $key, %rax # pass the $key
1122 lea 0x80($inp), $inp
1125 call _bsaes_decrypt8
1127 movdqu @XMM[0], 0x00($out) # write output
1128 movdqu @XMM[1], 0x10($out)
1129 movdqu @XMM[6], 0x20($out)
1130 movdqu @XMM[4], 0x30($out)
1131 movdqu @XMM[2], 0x40($out)
1132 movdqu @XMM[7], 0x50($out)
1133 movdqu @XMM[3], 0x60($out)
1134 movdqu @XMM[5], 0x70($out)
1135 lea 0x80($out), $out
1139 .size bsaes_decrypt_128,.-bsaes_decrypt_128
1143 ######################################################################
# Map the platform calling convention (Win64 vs SysV) onto named argument
# registers; $inp/$out/$len/$key live in callee-saved registers so they
# survive calls into the AES primitives.
my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1153 .globl bsaes_ecb_encrypt_blocks
1154 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1156 bsaes_ecb_encrypt_blocks:
1165 lea -0x48(%rsp),%rsp
1167 $code.=<<___ if ($win64);
1168 lea -0xa0(%rsp), %rsp
1169 movaps %xmm6, 0x40(%rsp)
1170 movaps %xmm7, 0x50(%rsp)
1171 movaps %xmm8, 0x60(%rsp)
1172 movaps %xmm9, 0x70(%rsp)
1173 movaps %xmm10, 0x80(%rsp)
1174 movaps %xmm11, 0x90(%rsp)
1175 movaps %xmm12, 0xa0(%rsp)
1176 movaps %xmm13, 0xb0(%rsp)
1177 movaps %xmm14, 0xc0(%rsp)
1178 movaps %xmm15, 0xd0(%rsp)
1182 mov %rsp,%rbp # backup %rsp
1183 mov 240($arg4),%eax # rounds
1184 mov $arg1,$inp # backup arguments
1191 mov %eax,%ebx # backup rounds
1192 shl \$7,%rax # 128 bytes per inner round key
1193 sub \$`128-32`,%rax # size of bit-sliced key schedule
1195 mov %rsp,%rax # pass key schedule
1196 mov $key,%rcx # pass key
1197 mov %ebx,%r10d # pass rounds
1198 call _bsaes_key_convert
1199 pxor %xmm6,%xmm7 # fix up last round key
1200 movdqa %xmm7,(%rax) # save last round key
1204 movdqu 0x00($inp), @XMM[0] # load input
1205 movdqu 0x10($inp), @XMM[1]
1206 movdqu 0x20($inp), @XMM[2]
1207 movdqu 0x30($inp), @XMM[3]
1208 movdqu 0x40($inp), @XMM[4]
1209 movdqu 0x50($inp), @XMM[5]
1210 mov %rsp, %rax # pass key schedule
1211 movdqu 0x60($inp), @XMM[6]
1212 mov %ebx,%r10d # pass rounds
1213 movdqu 0x70($inp), @XMM[7]
1214 lea 0x80($inp), $inp
1216 call _bsaes_encrypt8
1218 movdqu @XMM[0], 0x00($out) # write output
1219 movdqu @XMM[1], 0x10($out)
1220 movdqu @XMM[4], 0x20($out)
1221 movdqu @XMM[6], 0x30($out)
1222 movdqu @XMM[3], 0x40($out)
1223 movdqu @XMM[7], 0x50($out)
1224 movdqu @XMM[2], 0x60($out)
1225 movdqu @XMM[5], 0x70($out)
1226 lea 0x80($out), $out
1233 movdqu 0x00($inp), @XMM[0] # load input
1234 mov %rsp, %rax # pass key schedule
1235 mov %ebx,%r10d # pass rounds
1238 movdqu 0x10($inp), @XMM[1]
1240 movdqu 0x20($inp), @XMM[2]
1243 movdqu 0x30($inp), @XMM[3]
1245 movdqu 0x40($inp), @XMM[4]
1248 movdqu 0x50($inp), @XMM[5]
1250 movdqu 0x60($inp), @XMM[6]
1251 call _bsaes_encrypt8
1252 movdqu @XMM[0], 0x00($out) # write output
1253 movdqu @XMM[1], 0x10($out)
1254 movdqu @XMM[4], 0x20($out)
1255 movdqu @XMM[6], 0x30($out)
1256 movdqu @XMM[3], 0x40($out)
1257 movdqu @XMM[7], 0x50($out)
1258 movdqu @XMM[2], 0x60($out)
1262 call _bsaes_encrypt8
1263 movdqu @XMM[0], 0x00($out) # write output
1264 movdqu @XMM[1], 0x10($out)
1265 movdqu @XMM[4], 0x20($out)
1266 movdqu @XMM[6], 0x30($out)
1267 movdqu @XMM[3], 0x40($out)
1268 movdqu @XMM[7], 0x50($out)
1272 call _bsaes_encrypt8
1273 movdqu @XMM[0], 0x00($out) # write output
1274 movdqu @XMM[1], 0x10($out)
1275 movdqu @XMM[4], 0x20($out)
1276 movdqu @XMM[6], 0x30($out)
1277 movdqu @XMM[3], 0x40($out)
1281 call _bsaes_encrypt8
1282 movdqu @XMM[0], 0x00($out) # write output
1283 movdqu @XMM[1], 0x10($out)
1284 movdqu @XMM[4], 0x20($out)
1285 movdqu @XMM[6], 0x30($out)
1289 call _bsaes_encrypt8
1290 movdqu @XMM[0], 0x00($out) # write output
1291 movdqu @XMM[1], 0x10($out)
1292 movdqu @XMM[4], 0x20($out)
1296 call _bsaes_encrypt8
1297 movdqu @XMM[0], 0x00($out) # write output
1298 movdqu @XMM[1], 0x10($out)
1302 call _bsaes_encrypt8
1303 movdqu @XMM[0], 0x00($out) # write output
1310 call asm_AES_encrypt
1319 .Lecb_enc_bzero: # wipe key schedule [if any]
1320 movdqa %xmm0, 0x00(%rax)
1321 movdqa %xmm0, 0x10(%rax)
1322 lea 0x20(%rax), %rax
1326 lea (%rbp),%rsp # restore %rsp
1328 $code.=<<___ if ($win64);
1329 movaps 0x40(%rbp), %xmm6
1330 movaps 0x50(%rbp), %xmm7
1331 movaps 0x60(%rbp), %xmm8
1332 movaps 0x70(%rbp), %xmm9
1333 movaps 0x80(%rbp), %xmm10
1334 movaps 0x90(%rbp), %xmm11
1335 movaps 0xa0(%rbp), %xmm12
1336 movaps 0xb0(%rbp), %xmm13
1337 movaps 0xc0(%rbp), %xmm14
1338 movaps 0xd0(%rbp), %xmm15
1339 lea 0xa0(%rbp), %rsp
1342 mov 0x48(%rsp), %r15
1343 mov 0x50(%rsp), %r14
1344 mov 0x58(%rsp), %r13
1345 mov 0x60(%rsp), %r12
1346 mov 0x68(%rsp), %rbx
1347 mov 0x70(%rsp), %rax
1348 lea 0x78(%rsp), %rsp
1352 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1354 .globl bsaes_ecb_decrypt_blocks
1355 .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1357 bsaes_ecb_decrypt_blocks:
1366 lea -0x48(%rsp),%rsp
1368 $code.=<<___ if ($win64);
1369 lea -0xa0(%rsp), %rsp
1370 movaps %xmm6, 0x40(%rsp)
1371 movaps %xmm7, 0x50(%rsp)
1372 movaps %xmm8, 0x60(%rsp)
1373 movaps %xmm9, 0x70(%rsp)
1374 movaps %xmm10, 0x80(%rsp)
1375 movaps %xmm11, 0x90(%rsp)
1376 movaps %xmm12, 0xa0(%rsp)
1377 movaps %xmm13, 0xb0(%rsp)
1378 movaps %xmm14, 0xc0(%rsp)
1379 movaps %xmm15, 0xd0(%rsp)
1383 mov %rsp,%rbp # backup %rsp
1384 mov 240($arg4),%eax # rounds
1385 mov $arg1,$inp # backup arguments
1392 mov %eax,%ebx # backup rounds
1393 shl \$7,%rax # 128 bytes per inner round key
1394 sub \$`128-32`,%rax # size of bit-sliced key schedule
1396 mov %rsp,%rax # pass key schedule
1397 mov $key,%rcx # pass key
1398 mov %ebx,%r10d # pass rounds
1399 call _bsaes_key_convert
1400 pxor (%rsp),%xmm7 # fix up 0 round key
1401 movdqa %xmm6,(%rax) # save last round key
1406 movdqu 0x00($inp), @XMM[0] # load input
1407 movdqu 0x10($inp), @XMM[1]
1408 movdqu 0x20($inp), @XMM[2]
1409 movdqu 0x30($inp), @XMM[3]
1410 movdqu 0x40($inp), @XMM[4]
1411 movdqu 0x50($inp), @XMM[5]
1412 mov %rsp, %rax # pass key schedule
1413 movdqu 0x60($inp), @XMM[6]
1414 mov %ebx,%r10d # pass rounds
1415 movdqu 0x70($inp), @XMM[7]
1416 lea 0x80($inp), $inp
1418 call _bsaes_decrypt8
1420 movdqu @XMM[0], 0x00($out) # write output
1421 movdqu @XMM[1], 0x10($out)
1422 movdqu @XMM[6], 0x20($out)
1423 movdqu @XMM[4], 0x30($out)
1424 movdqu @XMM[2], 0x40($out)
1425 movdqu @XMM[7], 0x50($out)
1426 movdqu @XMM[3], 0x60($out)
1427 movdqu @XMM[5], 0x70($out)
1428 lea 0x80($out), $out
1435 movdqu 0x00($inp), @XMM[0] # load input
1436 mov %rsp, %rax # pass key schedule
1437 mov %ebx,%r10d # pass rounds
1440 movdqu 0x10($inp), @XMM[1]
1442 movdqu 0x20($inp), @XMM[2]
1445 movdqu 0x30($inp), @XMM[3]
1447 movdqu 0x40($inp), @XMM[4]
1450 movdqu 0x50($inp), @XMM[5]
1452 movdqu 0x60($inp), @XMM[6]
1453 call _bsaes_decrypt8
1454 movdqu @XMM[0], 0x00($out) # write output
1455 movdqu @XMM[1], 0x10($out)
1456 movdqu @XMM[6], 0x20($out)
1457 movdqu @XMM[4], 0x30($out)
1458 movdqu @XMM[2], 0x40($out)
1459 movdqu @XMM[7], 0x50($out)
1460 movdqu @XMM[3], 0x60($out)
1464 call _bsaes_decrypt8
1465 movdqu @XMM[0], 0x00($out) # write output
1466 movdqu @XMM[1], 0x10($out)
1467 movdqu @XMM[6], 0x20($out)
1468 movdqu @XMM[4], 0x30($out)
1469 movdqu @XMM[2], 0x40($out)
1470 movdqu @XMM[7], 0x50($out)
1474 call _bsaes_decrypt8
1475 movdqu @XMM[0], 0x00($out) # write output
1476 movdqu @XMM[1], 0x10($out)
1477 movdqu @XMM[6], 0x20($out)
1478 movdqu @XMM[4], 0x30($out)
1479 movdqu @XMM[2], 0x40($out)
1483 call _bsaes_decrypt8
1484 movdqu @XMM[0], 0x00($out) # write output
1485 movdqu @XMM[1], 0x10($out)
1486 movdqu @XMM[6], 0x20($out)
1487 movdqu @XMM[4], 0x30($out)
1491 call _bsaes_decrypt8
1492 movdqu @XMM[0], 0x00($out) # write output
1493 movdqu @XMM[1], 0x10($out)
1494 movdqu @XMM[6], 0x20($out)
1498 call _bsaes_decrypt8
1499 movdqu @XMM[0], 0x00($out) # write output
1500 movdqu @XMM[1], 0x10($out)
1504 call _bsaes_decrypt8
1505 movdqu @XMM[0], 0x00($out) # write output
1512 call asm_AES_decrypt
1521 .Lecb_dec_bzero: # wipe key schedule [if any]
1522 movdqa %xmm0, 0x00(%rax)
1523 movdqa %xmm0, 0x10(%rax)
1524 lea 0x20(%rax), %rax
1528 lea (%rbp),%rsp # restore %rsp
1530 $code.=<<___ if ($win64);
1531 movaps 0x40(%rbp), %xmm6
1532 movaps 0x50(%rbp), %xmm7
1533 movaps 0x60(%rbp), %xmm8
1534 movaps 0x70(%rbp), %xmm9
1535 movaps 0x80(%rbp), %xmm10
1536 movaps 0x90(%rbp), %xmm11
1537 movaps 0xa0(%rbp), %xmm12
1538 movaps 0xb0(%rbp), %xmm13
1539 movaps 0xc0(%rbp), %xmm14
1540 movaps 0xd0(%rbp), %xmm15
1541 lea 0xa0(%rbp), %rsp
1544 mov 0x48(%rsp), %r15
1545 mov 0x50(%rsp), %r14
1546 mov 0x58(%rsp), %r13
1547 mov 0x60(%rsp), %r12
1548 mov 0x68(%rsp), %rbx
1549 mov 0x70(%rsp), %rax
1550 lea 0x78(%rsp), %rsp
1554 .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1558 .extern asm_AES_cbc_encrypt
1559 .globl bsaes_cbc_encrypt
1560 .type bsaes_cbc_encrypt,\@abi-omnipotent
1564 $code.=<<___ if ($win64);
1565 mov 48(%rsp),$arg6 # pull direction flag
1569 jne asm_AES_cbc_encrypt
1571 jb asm_AES_cbc_encrypt
1581 lea -0x48(%rsp), %rsp
1583 $code.=<<___ if ($win64);
1584 mov 0xa0(%rsp),$arg5 # pull ivp
1585 lea -0xa0(%rsp), %rsp
1586 movaps %xmm6, 0x40(%rsp)
1587 movaps %xmm7, 0x50(%rsp)
1588 movaps %xmm8, 0x60(%rsp)
1589 movaps %xmm9, 0x70(%rsp)
1590 movaps %xmm10, 0x80(%rsp)
1591 movaps %xmm11, 0x90(%rsp)
1592 movaps %xmm12, 0xa0(%rsp)
1593 movaps %xmm13, 0xb0(%rsp)
1594 movaps %xmm14, 0xc0(%rsp)
1595 movaps %xmm15, 0xd0(%rsp)
1599 mov %rsp, %rbp # backup %rsp
1600 mov 240($arg4), %eax # rounds
1601 mov $arg1, $inp # backup arguments
1606 shr \$4, $len # bytes to blocks
1608 mov %eax, %edx # rounds
1609 shl \$7, %rax # 128 bytes per inner round key
1610 sub \$`128-32`, %rax # size of bit-sliced key schedule
1613 mov %rsp, %rax # pass key schedule
1614 mov $key, %rcx # pass key
1615 mov %edx, %r10d # pass rounds
1616 call _bsaes_key_convert
1617 pxor (%rsp),%xmm7 # fix up 0 round key
1618 movdqa %xmm6,(%rax) # save last round key
1621 movdqu (%rbx), @XMM[15] # load IV
1624 movdqu 0x00($inp), @XMM[0] # load input
1625 movdqu 0x10($inp), @XMM[1]
1626 movdqu 0x20($inp), @XMM[2]
1627 movdqu 0x30($inp), @XMM[3]
1628 movdqu 0x40($inp), @XMM[4]
1629 movdqu 0x50($inp), @XMM[5]
1630 mov %rsp, %rax # pass key schedule
1631 movdqu 0x60($inp), @XMM[6]
1632 mov %edx,%r10d # pass rounds
1633 movdqu 0x70($inp), @XMM[7]
1634 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1636 call _bsaes_decrypt8
1638 pxor 0x20(%rbp), @XMM[0] # ^= IV
1639 movdqu 0x00($inp), @XMM[8] # re-load input
1640 movdqu 0x10($inp), @XMM[9]
1641 pxor @XMM[8], @XMM[1]
1642 movdqu 0x20($inp), @XMM[10]
1643 pxor @XMM[9], @XMM[6]
1644 movdqu 0x30($inp), @XMM[11]
1645 pxor @XMM[10], @XMM[4]
1646 movdqu 0x40($inp), @XMM[12]
1647 pxor @XMM[11], @XMM[2]
1648 movdqu 0x50($inp), @XMM[13]
1649 pxor @XMM[12], @XMM[7]
1650 movdqu 0x60($inp), @XMM[14]
1651 pxor @XMM[13], @XMM[3]
1652 movdqu 0x70($inp), @XMM[15] # IV
1653 pxor @XMM[14], @XMM[5]
1654 movdqu @XMM[0], 0x00($out) # write output
1655 lea 0x80($inp), $inp
1656 movdqu @XMM[1], 0x10($out)
1657 movdqu @XMM[6], 0x20($out)
1658 movdqu @XMM[4], 0x30($out)
1659 movdqu @XMM[2], 0x40($out)
1660 movdqu @XMM[7], 0x50($out)
1661 movdqu @XMM[3], 0x60($out)
1662 movdqu @XMM[5], 0x70($out)
1663 lea 0x80($out), $out
1670 movdqu 0x00($inp), @XMM[0] # load input
1671 mov %rsp, %rax # pass key schedule
1672 mov %edx, %r10d # pass rounds
1675 movdqu 0x10($inp), @XMM[1]
1677 movdqu 0x20($inp), @XMM[2]
1680 movdqu 0x30($inp), @XMM[3]
1682 movdqu 0x40($inp), @XMM[4]
1685 movdqu 0x50($inp), @XMM[5]
1687 movdqu 0x60($inp), @XMM[6]
1688 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1689 call _bsaes_decrypt8
1690 pxor 0x20(%rbp), @XMM[0] # ^= IV
1691 movdqu 0x00($inp), @XMM[8] # re-load input
1692 movdqu 0x10($inp), @XMM[9]
1693 pxor @XMM[8], @XMM[1]
1694 movdqu 0x20($inp), @XMM[10]
1695 pxor @XMM[9], @XMM[6]
1696 movdqu 0x30($inp), @XMM[11]
1697 pxor @XMM[10], @XMM[4]
1698 movdqu 0x40($inp), @XMM[12]
1699 pxor @XMM[11], @XMM[2]
1700 movdqu 0x50($inp), @XMM[13]
1701 pxor @XMM[12], @XMM[7]
1702 movdqu 0x60($inp), @XMM[15] # IV
1703 pxor @XMM[13], @XMM[3]
1704 movdqu @XMM[0], 0x00($out) # write output
1705 movdqu @XMM[1], 0x10($out)
1706 movdqu @XMM[6], 0x20($out)
1707 movdqu @XMM[4], 0x30($out)
1708 movdqu @XMM[2], 0x40($out)
1709 movdqu @XMM[7], 0x50($out)
1710 movdqu @XMM[3], 0x60($out)
1714 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1715 call _bsaes_decrypt8
1716 pxor 0x20(%rbp), @XMM[0] # ^= IV
1717 movdqu 0x00($inp), @XMM[8] # re-load input
1718 movdqu 0x10($inp), @XMM[9]
1719 pxor @XMM[8], @XMM[1]
1720 movdqu 0x20($inp), @XMM[10]
1721 pxor @XMM[9], @XMM[6]
1722 movdqu 0x30($inp), @XMM[11]
1723 pxor @XMM[10], @XMM[4]
1724 movdqu 0x40($inp), @XMM[12]
1725 pxor @XMM[11], @XMM[2]
1726 movdqu 0x50($inp), @XMM[15] # IV
1727 pxor @XMM[12], @XMM[7]
1728 movdqu @XMM[0], 0x00($out) # write output
1729 movdqu @XMM[1], 0x10($out)
1730 movdqu @XMM[6], 0x20($out)
1731 movdqu @XMM[4], 0x30($out)
1732 movdqu @XMM[2], 0x40($out)
1733 movdqu @XMM[7], 0x50($out)
1737 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1738 call _bsaes_decrypt8
1739 pxor 0x20(%rbp), @XMM[0] # ^= IV
1740 movdqu 0x00($inp), @XMM[8] # re-load input
1741 movdqu 0x10($inp), @XMM[9]
1742 pxor @XMM[8], @XMM[1]
1743 movdqu 0x20($inp), @XMM[10]
1744 pxor @XMM[9], @XMM[6]
1745 movdqu 0x30($inp), @XMM[11]
1746 pxor @XMM[10], @XMM[4]
1747 movdqu 0x40($inp), @XMM[15] # IV
1748 pxor @XMM[11], @XMM[2]
1749 movdqu @XMM[0], 0x00($out) # write output
1750 movdqu @XMM[1], 0x10($out)
1751 movdqu @XMM[6], 0x20($out)
1752 movdqu @XMM[4], 0x30($out)
1753 movdqu @XMM[2], 0x40($out)
1757 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1758 call _bsaes_decrypt8
1759 pxor 0x20(%rbp), @XMM[0] # ^= IV
1760 movdqu 0x00($inp), @XMM[8] # re-load input
1761 movdqu 0x10($inp), @XMM[9]
1762 pxor @XMM[8], @XMM[1]
1763 movdqu 0x20($inp), @XMM[10]
1764 pxor @XMM[9], @XMM[6]
1765 movdqu 0x30($inp), @XMM[15] # IV
1766 pxor @XMM[10], @XMM[4]
1767 movdqu @XMM[0], 0x00($out) # write output
1768 movdqu @XMM[1], 0x10($out)
1769 movdqu @XMM[6], 0x20($out)
1770 movdqu @XMM[4], 0x30($out)
1774 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1775 call _bsaes_decrypt8
1776 pxor 0x20(%rbp), @XMM[0] # ^= IV
1777 movdqu 0x00($inp), @XMM[8] # re-load input
1778 movdqu 0x10($inp), @XMM[9]
1779 pxor @XMM[8], @XMM[1]
1780 movdqu 0x20($inp), @XMM[15] # IV
1781 pxor @XMM[9], @XMM[6]
1782 movdqu @XMM[0], 0x00($out) # write output
1783 movdqu @XMM[1], 0x10($out)
1784 movdqu @XMM[6], 0x20($out)
1788 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1789 call _bsaes_decrypt8
1790 pxor 0x20(%rbp), @XMM[0] # ^= IV
1791 movdqu 0x00($inp), @XMM[8] # re-load input
1792 movdqu 0x10($inp), @XMM[15] # IV
1793 pxor @XMM[8], @XMM[1]
1794 movdqu @XMM[0], 0x00($out) # write output
1795 movdqu @XMM[1], 0x10($out)
# CBC-decrypt single-block path (falls back to the non-bitsliced
# asm_AES_decrypt, which does not clobber %xmm registers), then the
# common function epilogue: return the final IV, wipe the on-stack
# bit-sliced key schedule, restore Win64 xmm save area and callee-saved
# GPRs, and return.
1800 lea 0x20(%rbp), $arg2 # buffer output
1802 call asm_AES_decrypt # doesn't touch %xmm
1803 pxor 0x20(%rbp), @XMM[15] # ^= IV
1804 movdqu @XMM[15], ($out) # write output
1805 movdqa @XMM[0], @XMM[15] # IV
1808 movdqu @XMM[15], (%rbx) # return IV
# Zero 32 bytes per iteration until the key schedule area is cleared
# (loop branch not visible in this excerpt).
1811 .Lcbc_dec_bzero: # wipe key schedule [if any]
1812 movdqa %xmm0, 0x00(%rax)
1813 movdqa %xmm0, 0x10(%rax)
1814 lea 0x20(%rax), %rax
1818 lea (%rbp),%rsp # restore %rsp
1820 $code.=<<___ if ($win64);
1821 movaps 0x40(%rbp), %xmm6
1822 movaps 0x50(%rbp), %xmm7
1823 movaps 0x60(%rbp), %xmm8
1824 movaps 0x70(%rbp), %xmm9
1825 movaps 0x80(%rbp), %xmm10
1826 movaps 0x90(%rbp), %xmm11
1827 movaps 0xa0(%rbp), %xmm12
1828 movaps 0xb0(%rbp), %xmm13
1829 movaps 0xc0(%rbp), %xmm14
1830 movaps 0xd0(%rbp), %xmm15
1831 lea 0xa0(%rbp), %rsp
# Restore callee-saved registers pushed in the prologue.
1834 mov 0x48(%rsp), %r15
1835 mov 0x50(%rsp), %r14
1836 mov 0x58(%rsp), %r13
1837 mov 0x60(%rsp), %r12
1838 mov 0x68(%rsp), %rbx
1839 mov 0x70(%rsp), %rax
1840 lea 0x78(%rsp), %rsp
1844 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
# bsaes_ctr32_encrypt_blocks: CTR-mode encryption, 8 blocks per
# iteration. Converts the conventional key schedule to bit-sliced form
# on the stack, byte-swaps the counter so eight consecutive counter
# values can be produced with paddd, then encrypts and xors into the
# input stream.
1846 .globl bsaes_ctr32_encrypt_blocks
1847 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1849 bsaes_ctr32_encrypt_blocks:
1858 lea -0x48(%rsp), %rsp
1860 $code.=<<___ if ($win64);
1861 mov 0xa0(%rsp),$arg5 # pull ivp
1862 lea -0xa0(%rsp), %rsp
1863 movaps %xmm6, 0x40(%rsp)
1864 movaps %xmm7, 0x50(%rsp)
1865 movaps %xmm8, 0x60(%rsp)
1866 movaps %xmm9, 0x70(%rsp)
1867 movaps %xmm10, 0x80(%rsp)
1868 movaps %xmm11, 0x90(%rsp)
1869 movaps %xmm12, 0xa0(%rsp)
1870 movaps %xmm13, 0xb0(%rsp)
1871 movaps %xmm14, 0xc0(%rsp)
1872 movaps %xmm15, 0xd0(%rsp)
1876 mov %rsp, %rbp # backup %rsp
1877 movdqu ($arg5), %xmm0 # load counter
1878 mov 240($arg4), %eax # rounds
1879 mov $arg1, $inp # backup arguments
1883 movdqa %xmm0, 0x20(%rbp) # copy counter
# Allocate the bit-sliced key schedule below %rbp: 128 bytes per round,
# minus the 96 bytes saved by the first/last-round-key special casing.
1887 mov %eax, %ebx # rounds
1888 shl \$7, %rax # 128 bytes per inner round key
1889 sub \$`128-32`, %rax # size of bit-sliced key schedule
1892 mov %rsp, %rax # pass key schedule
1893 mov $key, %rcx # pass key
1894 mov %ebx, %r10d # pass rounds
1895 call _bsaes_key_convert
1896 pxor %xmm6,%xmm7 # fix up last round key
1897 movdqa %xmm7,(%rax) # save last round key
# Pre-swap the round-0 key so the not-yet-byte-swapped counter blocks
# can be xored with it directly.
1899 movdqa (%rsp), @XMM[9] # load round0 key
1900 lea .LADD1(%rip), %r11
1901 movdqa 0x20(%rbp), @XMM[0] # counter copy
1902 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
1903 pshufb @XMM[8], @XMM[9] # byte swap upper part
1904 pshufb @XMM[8], @XMM[0]
1905 movdqa @XMM[9], (%rsp) # save adjusted round0 key
# Main loop: build counter, counter+1 .. counter+7 via the .LADD*
# constants.
1909 movdqa @XMM[0], 0x20(%rbp) # save counter
1910 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
1911 movdqa @XMM[0], @XMM[2]
1912 paddd 0x00(%r11), @XMM[1] # .LADD1
1913 movdqa @XMM[0], @XMM[3]
1914 paddd 0x10(%r11), @XMM[2] # .LADD2
1915 movdqa @XMM[0], @XMM[4]
1916 paddd 0x20(%r11), @XMM[3] # .LADD3
1917 movdqa @XMM[0], @XMM[5]
1918 paddd 0x30(%r11), @XMM[4] # .LADD4
1919 movdqa @XMM[0], @XMM[6]
1920 paddd 0x40(%r11), @XMM[5] # .LADD5
1921 movdqa @XMM[0], @XMM[7]
1922 paddd 0x50(%r11), @XMM[6] # .LADD6
1923 paddd 0x60(%r11), @XMM[7] # .LADD7
1925 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1926 # to flip byte order in 32-bit counter
1927 movdqa (%rsp), @XMM[9] # round 0 key
1928 lea 0x10(%rsp), %rax # pass key schedule
1929 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1930 pxor @XMM[9], @XMM[0] # xor with round0 key
1931 pxor @XMM[9], @XMM[1]
1932 pxor @XMM[9], @XMM[2]
1933 pxor @XMM[9], @XMM[3]
1934 pshufb @XMM[8], @XMM[0]
1935 pshufb @XMM[8], @XMM[1]
1936 pxor @XMM[9], @XMM[4]
1937 pxor @XMM[9], @XMM[5]
1938 pshufb @XMM[8], @XMM[2]
1939 pshufb @XMM[8], @XMM[3]
1940 pxor @XMM[9], @XMM[6]
1941 pxor @XMM[9], @XMM[7]
1942 pshufb @XMM[8], @XMM[4]
1943 pshufb @XMM[8], @XMM[5]
1944 pshufb @XMM[8], @XMM[6]
1945 pshufb @XMM[8], @XMM[7]
1946 lea .LBS0(%rip), %r11 # constants table
1947 mov %ebx,%r10d # pass rounds
1949 call _bsaes_encrypt8_bitslice
# Carry set (len went negative) means fewer than 8 blocks remain.
1952 jc .Lctr_enc_loop_done
1954 movdqu 0x00($inp), @XMM[8] # load input
1955 movdqu 0x10($inp), @XMM[9]
1956 movdqu 0x20($inp), @XMM[10]
1957 movdqu 0x30($inp), @XMM[11]
1958 movdqu 0x40($inp), @XMM[12]
1959 movdqu 0x50($inp), @XMM[13]
1960 movdqu 0x60($inp), @XMM[14]
1961 movdqu 0x70($inp), @XMM[15]
# Keystream lanes come back permuted as 0,1,4,6,3,7,2,5.
1963 pxor @XMM[0], @XMM[8]
1964 movdqa 0x20(%rbp), @XMM[0] # load counter
1965 pxor @XMM[9], @XMM[1]
1966 movdqu @XMM[8], 0x00($out) # write output
1967 pxor @XMM[10], @XMM[4]
1968 movdqu @XMM[1], 0x10($out)
1969 pxor @XMM[11], @XMM[6]
1970 movdqu @XMM[4], 0x20($out)
1971 pxor @XMM[12], @XMM[3]
1972 movdqu @XMM[6], 0x30($out)
1973 pxor @XMM[13], @XMM[7]
1974 movdqu @XMM[3], 0x40($out)
1975 pxor @XMM[14], @XMM[2]
1976 movdqu @XMM[7], 0x50($out)
1977 pxor @XMM[15], @XMM[5]
1978 movdqu @XMM[2], 0x60($out)
1979 lea .LADD1(%rip), %r11
1980 movdqu @XMM[5], 0x70($out)
1981 lea 0x80($out), $out
1982 paddd 0x70(%r11), @XMM[0] # .LADD8
# Partial final iteration: emit only as many of the 8 keystream blocks
# as there is input left. Conditional branches between the per-block
# chunks are not visible in this excerpt.
1987 .Lctr_enc_loop_done:
1989 movdqu 0x00($inp), @XMM[8] # load input
1990 pxor @XMM[8], @XMM[0]
1991 movdqu @XMM[0], 0x00($out) # write output
1994 movdqu 0x10($inp), @XMM[9]
1995 pxor @XMM[9], @XMM[1]
1996 movdqu @XMM[1], 0x10($out)
1998 movdqu 0x20($inp), @XMM[10]
1999 pxor @XMM[10], @XMM[4]
2000 movdqu @XMM[4], 0x20($out)
2003 movdqu 0x30($inp), @XMM[11]
2004 pxor @XMM[11], @XMM[6]
2005 movdqu @XMM[6], 0x30($out)
2007 movdqu 0x40($inp), @XMM[12]
2008 pxor @XMM[12], @XMM[3]
2009 movdqu @XMM[3], 0x40($out)
2012 movdqu 0x50($inp), @XMM[13]
2013 pxor @XMM[13], @XMM[7]
2014 movdqu @XMM[7], 0x50($out)
2016 movdqu 0x60($inp), @XMM[14]
2017 pxor @XMM[14], @XMM[2]
2018 movdqu @XMM[2], 0x60($out)
# Short (single-block-at-a-time) CTR path using the non-bitsliced
# asm_AES_encrypt. The big-endian 32-bit counter word lives in the
# counter block saved at 0x20(%rbp); its last dword is at 0x2c(%rbp).
# BUGFIX: the updated counter was being stored to 0x2c(%rsp) while it
# is loaded from 0x2c(%rbp) (see below). %rsp was lowered past %rbp to
# hold the bit-sliced key schedule, so the two addresses differ and the
# counter was never actually incremented. Store back to 0x2c(%rbp).
2023 lea 0x20(%rbp), $arg1
2024 lea 0x30(%rbp), $arg2
2026 call asm_AES_encrypt
2027 movdqu ($inp), @XMM[1]
2029 mov 0x2c(%rbp), %eax # load 32-bit counter
2031 pxor 0x30(%rbp), @XMM[1]
2032 inc %eax # increment
2033 movdqu @XMM[1], ($out)
2036 mov %eax, 0x2c(%rbp) # save 32-bit counter (same slot it was loaded from)
# CTR epilogue: wipe the on-stack key schedule, restore Win64 xmm save
# area and callee-saved GPRs, return.
2043 .Lctr_enc_bzero: # wipe key schedule [if any]
2044 movdqa %xmm0, 0x00(%rax)
2045 movdqa %xmm0, 0x10(%rax)
2046 lea 0x20(%rax), %rax
2050 lea (%rbp),%rsp # restore %rsp
2052 $code.=<<___ if ($win64);
2053 movaps 0x40(%rbp), %xmm6
2054 movaps 0x50(%rbp), %xmm7
2055 movaps 0x60(%rbp), %xmm8
2056 movaps 0x70(%rbp), %xmm9
2057 movaps 0x80(%rbp), %xmm10
2058 movaps 0x90(%rbp), %xmm11
2059 movaps 0xa0(%rbp), %xmm12
2060 movaps 0xb0(%rbp), %xmm13
2061 movaps 0xc0(%rbp), %xmm14
2062 movaps 0xd0(%rbp), %xmm15
2063 lea 0xa0(%rbp), %rsp
2066 mov 0x48(%rsp), %r15
2067 mov 0x50(%rsp), %r14
2068 mov 0x58(%rsp), %r13
2069 mov 0x60(%rsp), %r12
2070 mov 0x68(%rsp), %rbx
2071 mov 0x70(%rsp), %rax
2072 lea 0x78(%rsp), %rsp
2076 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
2078 ######################################################################
2079 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
2080 # const AES_KEY *key1, const AES_KEY *key2,
2081 # const unsigned char iv[16]);
2083 my ($twmask,$twres,$twtmp)=@XMM[13..15];
2087 .globl bsaes_xts_encrypt
2088 .type bsaes_xts_encrypt,\@abi-omnipotent
2099 lea -0x48(%rsp), %rsp
2101 $code.=<<___ if ($win64);
2102 mov 0xa0(%rsp),$arg5 # pull key2
2103 mov 0xa8(%rsp),$arg6 # pull ivp
2104 lea -0xa0(%rsp), %rsp
2105 movaps %xmm6, 0x40(%rsp)
2106 movaps %xmm7, 0x50(%rsp)
2107 movaps %xmm8, 0x60(%rsp)
2108 movaps %xmm9, 0x70(%rsp)
2109 movaps %xmm10, 0x80(%rsp)
2110 movaps %xmm11, 0x90(%rsp)
2111 movaps %xmm12, 0xa0(%rsp)
2112 movaps %xmm13, 0xb0(%rsp)
2113 movaps %xmm14, 0xc0(%rsp)
2114 movaps %xmm15, 0xd0(%rsp)
2118 mov %rsp, %rbp # backup %rsp
2119 mov $arg1, $inp # backup arguments
2125 lea 0x20(%rbp), $arg2
2127 call asm_AES_encrypt # generate initial tweak
2129 mov 240($key), %eax # rounds
2130 mov $len, %rbx # backup $len
2132 mov %eax, %edx # rounds
2133 shl \$7, %rax # 128 bytes per inner round key
2134 sub \$`128-32`, %rax # size of bit-sliced key schedule
2137 mov %rsp, %rax # pass key schedule
2138 mov $key, %rcx # pass key
2139 mov %edx, %r10d # pass rounds
2140 call _bsaes_key_convert
2141 pxor %xmm6, %xmm7 # fix up last round key
2142 movdqa %xmm7, (%rax) # save last round key
2145 sub \$0x80, %rsp # place for tweak[8]
2146 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2149 movdqa .Lxts_magic(%rip), $twmask
2150 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2159 for ($i=0;$i<7;$i++) {
2161 pshufd \$0x13, $twtmp, $twres
2163 movdqa @XMM[7], @XMM[$i]
2164 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2165 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2166 pand $twmask, $twres # isolate carry and residue
2167 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2168 pxor $twres, @XMM[7]
2170 $code.=<<___ if ($i>=1);
2171 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2173 $code.=<<___ if ($i>=2);
2174 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2178 movdqu 0x60($inp), @XMM[8+6]
2179 pxor @XMM[8+5], @XMM[5]
2180 movdqu 0x70($inp), @XMM[8+7]
2181 lea 0x80($inp), $inp
2182 movdqa @XMM[7], 0x70(%rsp)
2183 pxor @XMM[8+6], @XMM[6]
2184 lea 0x80(%rsp), %rax # pass key schedule
2185 pxor @XMM[8+7], @XMM[7]
2186 mov %edx, %r10d # pass rounds
2188 call _bsaes_encrypt8
2190 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2191 pxor 0x10(%rsp), @XMM[1]
2192 movdqu @XMM[0], 0x00($out) # write output
2193 pxor 0x20(%rsp), @XMM[4]
2194 movdqu @XMM[1], 0x10($out)
2195 pxor 0x30(%rsp), @XMM[6]
2196 movdqu @XMM[4], 0x20($out)
2197 pxor 0x40(%rsp), @XMM[3]
2198 movdqu @XMM[6], 0x30($out)
2199 pxor 0x50(%rsp), @XMM[7]
2200 movdqu @XMM[3], 0x40($out)
2201 pxor 0x60(%rsp), @XMM[2]
2202 movdqu @XMM[7], 0x50($out)
2203 pxor 0x70(%rsp), @XMM[5]
2204 movdqu @XMM[2], 0x60($out)
2205 movdqu @XMM[5], 0x70($out)
2206 lea 0x80($out), $out
2208 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2210 movdqa .Lxts_magic(%rip), $twmask
2211 pcmpgtd @XMM[7], $twtmp
2212 pshufd \$0x13, $twtmp, $twres
2214 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2215 pand $twmask, $twres # isolate carry and residue
2216 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2217 pxor $twres, @XMM[7]
2226 for ($i=0;$i<7;$i++) {
2228 pshufd \$0x13, $twtmp, $twres
2230 movdqa @XMM[7], @XMM[$i]
2231 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2232 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2233 pand $twmask, $twres # isolate carry and residue
2234 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2235 pxor $twres, @XMM[7]
2237 $code.=<<___ if ($i>=1);
2238 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2239 cmp \$`0x10*$i`,$len
2242 $code.=<<___ if ($i>=2);
2243 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2247 movdqu 0x60($inp), @XMM[8+6]
2248 pxor @XMM[8+5], @XMM[5]
2249 movdqa @XMM[7], 0x70(%rsp)
2250 lea 0x70($inp), $inp
2251 pxor @XMM[8+6], @XMM[6]
2252 lea 0x80(%rsp), %rax # pass key schedule
2253 mov %edx, %r10d # pass rounds
2255 call _bsaes_encrypt8
2257 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2258 pxor 0x10(%rsp), @XMM[1]
2259 movdqu @XMM[0], 0x00($out) # write output
2260 pxor 0x20(%rsp), @XMM[4]
2261 movdqu @XMM[1], 0x10($out)
2262 pxor 0x30(%rsp), @XMM[6]
2263 movdqu @XMM[4], 0x20($out)
2264 pxor 0x40(%rsp), @XMM[3]
2265 movdqu @XMM[6], 0x30($out)
2266 pxor 0x50(%rsp), @XMM[7]
2267 movdqu @XMM[3], 0x40($out)
2268 pxor 0x60(%rsp), @XMM[2]
2269 movdqu @XMM[7], 0x50($out)
2270 movdqu @XMM[2], 0x60($out)
2271 lea 0x70($out), $out
2273 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2277 pxor @XMM[8+4], @XMM[4]
2278 lea 0x60($inp), $inp
2279 pxor @XMM[8+5], @XMM[5]
2280 lea 0x80(%rsp), %rax # pass key schedule
2281 mov %edx, %r10d # pass rounds
2283 call _bsaes_encrypt8
2285 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2286 pxor 0x10(%rsp), @XMM[1]
2287 movdqu @XMM[0], 0x00($out) # write output
2288 pxor 0x20(%rsp), @XMM[4]
2289 movdqu @XMM[1], 0x10($out)
2290 pxor 0x30(%rsp), @XMM[6]
2291 movdqu @XMM[4], 0x20($out)
2292 pxor 0x40(%rsp), @XMM[3]
2293 movdqu @XMM[6], 0x30($out)
2294 pxor 0x50(%rsp), @XMM[7]
2295 movdqu @XMM[3], 0x40($out)
2296 movdqu @XMM[7], 0x50($out)
2297 lea 0x60($out), $out
2299 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2303 pxor @XMM[8+3], @XMM[3]
2304 lea 0x50($inp), $inp
2305 pxor @XMM[8+4], @XMM[4]
2306 lea 0x80(%rsp), %rax # pass key schedule
2307 mov %edx, %r10d # pass rounds
2309 call _bsaes_encrypt8
2311 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2312 pxor 0x10(%rsp), @XMM[1]
2313 movdqu @XMM[0], 0x00($out) # write output
2314 pxor 0x20(%rsp), @XMM[4]
2315 movdqu @XMM[1], 0x10($out)
2316 pxor 0x30(%rsp), @XMM[6]
2317 movdqu @XMM[4], 0x20($out)
2318 pxor 0x40(%rsp), @XMM[3]
2319 movdqu @XMM[6], 0x30($out)
2320 movdqu @XMM[3], 0x40($out)
2321 lea 0x50($out), $out
2323 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2327 pxor @XMM[8+2], @XMM[2]
2328 lea 0x40($inp), $inp
2329 pxor @XMM[8+3], @XMM[3]
2330 lea 0x80(%rsp), %rax # pass key schedule
2331 mov %edx, %r10d # pass rounds
2333 call _bsaes_encrypt8
2335 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2336 pxor 0x10(%rsp), @XMM[1]
2337 movdqu @XMM[0], 0x00($out) # write output
2338 pxor 0x20(%rsp), @XMM[4]
2339 movdqu @XMM[1], 0x10($out)
2340 pxor 0x30(%rsp), @XMM[6]
2341 movdqu @XMM[4], 0x20($out)
2342 movdqu @XMM[6], 0x30($out)
2343 lea 0x40($out), $out
2345 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2349 pxor @XMM[8+1], @XMM[1]
2350 lea 0x30($inp), $inp
2351 pxor @XMM[8+2], @XMM[2]
2352 lea 0x80(%rsp), %rax # pass key schedule
2353 mov %edx, %r10d # pass rounds
2355 call _bsaes_encrypt8
2357 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2358 pxor 0x10(%rsp), @XMM[1]
2359 movdqu @XMM[0], 0x00($out) # write output
2360 pxor 0x20(%rsp), @XMM[4]
2361 movdqu @XMM[1], 0x10($out)
2362 movdqu @XMM[4], 0x20($out)
2363 lea 0x30($out), $out
2365 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2369 pxor @XMM[8+0], @XMM[0]
2370 lea 0x20($inp), $inp
2371 pxor @XMM[8+1], @XMM[1]
2372 lea 0x80(%rsp), %rax # pass key schedule
2373 mov %edx, %r10d # pass rounds
2375 call _bsaes_encrypt8
2377 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2378 pxor 0x10(%rsp), @XMM[1]
2379 movdqu @XMM[0], 0x00($out) # write output
2380 movdqu @XMM[1], 0x10($out)
2381 lea 0x20($out), $out
2383 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2387 pxor @XMM[0], @XMM[8]
2388 lea 0x10($inp), $inp
2389 movdqa @XMM[8], 0x20(%rbp)
2390 lea 0x20(%rbp), $arg1
2391 lea 0x20(%rbp), $arg2
2393 call asm_AES_encrypt # doesn't touch %xmm
2394 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2395 #pxor @XMM[8], @XMM[0]
2396 #lea 0x80(%rsp), %rax # pass key schedule
2397 #mov %edx, %r10d # pass rounds
2398 #call _bsaes_encrypt8
2399 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2400 movdqu @XMM[0], 0x00($out) # write output
2401 lea 0x10($out), $out
2403 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
# XTS-encrypt ciphertext-stealing: swap the tail bytes with the last
# full ciphertext block, then re-encrypt that block with the current
# tweak (@XMM[7]) via asm_AES_encrypt.
# BUGFIX: the byte load used the raw register %rdx instead of the $out
# symbol; every other access to the same location in this sequence uses
# -16($out) (see the movdqu/movdqu below), and %rdx is an argument
# register that is not the output pointer here. Use -16($out).
2412 movzb -16($out), %ecx
2420 movdqu -16($out), @XMM[0]
2421 lea 0x20(%rbp), $arg1
2422 pxor @XMM[7], @XMM[0]
2423 lea 0x20(%rbp), $arg2
2424 movdqa @XMM[0], 0x20(%rbp)
2426 call asm_AES_encrypt # doesn't touch %xmm
2427 pxor 0x20(%rbp), @XMM[7]
2428 movdqu @XMM[7], -16($out)
# XTS-encrypt epilogue: wipe key schedule, restore Win64 xmm save area
# and callee-saved GPRs, return.
2433 .Lxts_enc_bzero: # wipe key schedule [if any]
2434 movdqa %xmm0, 0x00(%rax)
2435 movdqa %xmm0, 0x10(%rax)
2436 lea 0x20(%rax), %rax
2440 lea (%rbp),%rsp # restore %rsp
2442 $code.=<<___ if ($win64);
2443 movaps 0x40(%rbp), %xmm6
2444 movaps 0x50(%rbp), %xmm7
2445 movaps 0x60(%rbp), %xmm8
2446 movaps 0x70(%rbp), %xmm9
2447 movaps 0x80(%rbp), %xmm10
2448 movaps 0x90(%rbp), %xmm11
2449 movaps 0xa0(%rbp), %xmm12
2450 movaps 0xb0(%rbp), %xmm13
2451 movaps 0xc0(%rbp), %xmm14
2452 movaps 0xd0(%rbp), %xmm15
2453 lea 0xa0(%rbp), %rsp
2456 mov 0x48(%rsp), %r15
2457 mov 0x50(%rsp), %r14
2458 mov 0x58(%rsp), %r13
2459 mov 0x60(%rsp), %r12
2460 mov 0x68(%rsp), %rbx
2461 mov 0x70(%rsp), %rax
2462 lea 0x78(%rsp), %rsp
2466 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt
# bsaes_xts_decrypt: mirror of bsaes_xts_encrypt. The tweak is still
# generated by ENcrypting the IV with key2; payload blocks go through
# _bsaes_decrypt8. Decrypt8's output lanes are permuted 0,1,6,4,2,7,3,5
# (different from encrypt8), hence the different xor/store ordering.
2468 .globl bsaes_xts_decrypt
2469 .type bsaes_xts_decrypt,\@abi-omnipotent
2480 lea -0x48(%rsp), %rsp
2482 $code.=<<___ if ($win64);
2483 mov 0xa0(%rsp),$arg5 # pull key2
2484 mov 0xa8(%rsp),$arg6 # pull ivp
2485 lea -0xa0(%rsp), %rsp
2486 movaps %xmm6, 0x40(%rsp)
2487 movaps %xmm7, 0x50(%rsp)
2488 movaps %xmm8, 0x60(%rsp)
2489 movaps %xmm9, 0x70(%rsp)
2490 movaps %xmm10, 0x80(%rsp)
2491 movaps %xmm11, 0x90(%rsp)
2492 movaps %xmm12, 0xa0(%rsp)
2493 movaps %xmm13, 0xb0(%rsp)
2494 movaps %xmm14, 0xc0(%rsp)
2495 movaps %xmm15, 0xd0(%rsp)
2499 mov %rsp, %rbp # backup %rsp
2500 mov $arg1, $inp # backup arguments
2506 lea 0x20(%rbp), $arg2
2508 call asm_AES_encrypt # generate initial tweak
2510 mov 240($key), %eax # rounds
2511 mov $len, %rbx # backup $len
2513 mov %eax, %edx # rounds
2514 shl \$7, %rax # 128 bytes per inner round key
2515 sub \$`128-32`, %rax # size of bit-sliced key schedule
2518 mov %rsp, %rax # pass key schedule
2519 mov $key, %rcx # pass key
2520 mov %edx, %r10d # pass rounds
2521 call _bsaes_key_convert
# Decrypt key schedule fix-up differs from encrypt: round-0 key is
# adjusted and first/last keys swap roles.
2522 pxor (%rsp), %xmm7 # fix up round 0 key
2523 movdqa %xmm6, (%rax) # save last round key
2524 movdqa %xmm7, (%rsp)
2526 xor %eax, %eax # if ($len%16) len-=16;
2533 sub \$0x80, %rsp # place for tweak[8]
2534 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2537 movdqa .Lxts_magic(%rip), $twmask
2538 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
# Unrolled (in Perl) tweak[0..6] generation, same scheme as encrypt.
2547 for ($i=0;$i<7;$i++) {
2549 pshufd \$0x13, $twtmp, $twres
2551 movdqa @XMM[7], @XMM[$i]
2552 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2553 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2554 pand $twmask, $twres # isolate carry and residue
2555 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2556 pxor $twres, @XMM[7]
2558 $code.=<<___ if ($i>=1);
2559 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2561 $code.=<<___ if ($i>=2);
2562 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
# Full 8-block iteration body.
2566 movdqu 0x60($inp), @XMM[8+6]
2567 pxor @XMM[8+5], @XMM[5]
2568 movdqu 0x70($inp), @XMM[8+7]
2569 lea 0x80($inp), $inp
2570 movdqa @XMM[7], 0x70(%rsp)
2571 pxor @XMM[8+6], @XMM[6]
2572 lea 0x80(%rsp), %rax # pass key schedule
2573 pxor @XMM[8+7], @XMM[7]
2574 mov %edx, %r10d # pass rounds
2576 call _bsaes_decrypt8
2578 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2579 pxor 0x10(%rsp), @XMM[1]
2580 movdqu @XMM[0], 0x00($out) # write output
2581 pxor 0x20(%rsp), @XMM[6]
2582 movdqu @XMM[1], 0x10($out)
2583 pxor 0x30(%rsp), @XMM[4]
2584 movdqu @XMM[6], 0x20($out)
2585 pxor 0x40(%rsp), @XMM[2]
2586 movdqu @XMM[4], 0x30($out)
2587 pxor 0x50(%rsp), @XMM[7]
2588 movdqu @XMM[2], 0x40($out)
2589 pxor 0x60(%rsp), @XMM[3]
2590 movdqu @XMM[7], 0x50($out)
2591 pxor 0x70(%rsp), @XMM[5]
2592 movdqu @XMM[3], 0x60($out)
2593 movdqu @XMM[5], 0x70($out)
2594 lea 0x80($out), $out
2596 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2598 movdqa .Lxts_magic(%rip), $twmask
2599 pcmpgtd @XMM[7], $twtmp
2600 pshufd \$0x13, $twtmp, $twres
2602 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2603 pand $twmask, $twres # isolate carry and residue
2604 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2605 pxor $twres, @XMM[7]
# Short-path tweak generation with per-block length checks.
2614 for ($i=0;$i<7;$i++) {
2616 pshufd \$0x13, $twtmp, $twres
2618 movdqa @XMM[7], @XMM[$i]
2619 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2620 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2621 pand $twmask, $twres # isolate carry and residue
2622 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2623 pxor $twres, @XMM[7]
2625 $code.=<<___ if ($i>=1);
2626 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2627 cmp \$`0x10*$i`,$len
2630 $code.=<<___ if ($i>=2);
2631 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
# --- 7 blocks ---
2635 movdqu 0x60($inp), @XMM[8+6]
2636 pxor @XMM[8+5], @XMM[5]
2637 movdqa @XMM[7], 0x70(%rsp)
2638 lea 0x70($inp), $inp
2639 pxor @XMM[8+6], @XMM[6]
2640 lea 0x80(%rsp), %rax # pass key schedule
2641 mov %edx, %r10d # pass rounds
2643 call _bsaes_decrypt8
2645 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2646 pxor 0x10(%rsp), @XMM[1]
2647 movdqu @XMM[0], 0x00($out) # write output
2648 pxor 0x20(%rsp), @XMM[6]
2649 movdqu @XMM[1], 0x10($out)
2650 pxor 0x30(%rsp), @XMM[4]
2651 movdqu @XMM[6], 0x20($out)
2652 pxor 0x40(%rsp), @XMM[2]
2653 movdqu @XMM[4], 0x30($out)
2654 pxor 0x50(%rsp), @XMM[7]
2655 movdqu @XMM[2], 0x40($out)
2656 pxor 0x60(%rsp), @XMM[3]
2657 movdqu @XMM[7], 0x50($out)
2658 movdqu @XMM[3], 0x60($out)
2659 lea 0x70($out), $out
2661 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
# --- 6 blocks ---
2665 pxor @XMM[8+4], @XMM[4]
2666 lea 0x60($inp), $inp
2667 pxor @XMM[8+5], @XMM[5]
2668 lea 0x80(%rsp), %rax # pass key schedule
2669 mov %edx, %r10d # pass rounds
2671 call _bsaes_decrypt8
2673 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2674 pxor 0x10(%rsp), @XMM[1]
2675 movdqu @XMM[0], 0x00($out) # write output
2676 pxor 0x20(%rsp), @XMM[6]
2677 movdqu @XMM[1], 0x10($out)
2678 pxor 0x30(%rsp), @XMM[4]
2679 movdqu @XMM[6], 0x20($out)
2680 pxor 0x40(%rsp), @XMM[2]
2681 movdqu @XMM[4], 0x30($out)
2682 pxor 0x50(%rsp), @XMM[7]
2683 movdqu @XMM[2], 0x40($out)
2684 movdqu @XMM[7], 0x50($out)
2685 lea 0x60($out), $out
2687 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
# --- 5 blocks ---
2691 pxor @XMM[8+3], @XMM[3]
2692 lea 0x50($inp), $inp
2693 pxor @XMM[8+4], @XMM[4]
2694 lea 0x80(%rsp), %rax # pass key schedule
2695 mov %edx, %r10d # pass rounds
2697 call _bsaes_decrypt8
2699 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2700 pxor 0x10(%rsp), @XMM[1]
2701 movdqu @XMM[0], 0x00($out) # write output
2702 pxor 0x20(%rsp), @XMM[6]
2703 movdqu @XMM[1], 0x10($out)
2704 pxor 0x30(%rsp), @XMM[4]
2705 movdqu @XMM[6], 0x20($out)
2706 pxor 0x40(%rsp), @XMM[2]
2707 movdqu @XMM[4], 0x30($out)
2708 movdqu @XMM[2], 0x40($out)
2709 lea 0x50($out), $out
2711 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
# --- 4 blocks ---
2715 pxor @XMM[8+2], @XMM[2]
2716 lea 0x40($inp), $inp
2717 pxor @XMM[8+3], @XMM[3]
2718 lea 0x80(%rsp), %rax # pass key schedule
2719 mov %edx, %r10d # pass rounds
2721 call _bsaes_decrypt8
2723 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2724 pxor 0x10(%rsp), @XMM[1]
2725 movdqu @XMM[0], 0x00($out) # write output
2726 pxor 0x20(%rsp), @XMM[6]
2727 movdqu @XMM[1], 0x10($out)
2728 pxor 0x30(%rsp), @XMM[4]
2729 movdqu @XMM[6], 0x20($out)
2730 movdqu @XMM[4], 0x30($out)
2731 lea 0x40($out), $out
2733 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
# --- 3 blocks ---
2737 pxor @XMM[8+1], @XMM[1]
2738 lea 0x30($inp), $inp
2739 pxor @XMM[8+2], @XMM[2]
2740 lea 0x80(%rsp), %rax # pass key schedule
2741 mov %edx, %r10d # pass rounds
2743 call _bsaes_decrypt8
2745 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2746 pxor 0x10(%rsp), @XMM[1]
2747 movdqu @XMM[0], 0x00($out) # write output
2748 pxor 0x20(%rsp), @XMM[6]
2749 movdqu @XMM[1], 0x10($out)
2750 movdqu @XMM[6], 0x20($out)
2751 lea 0x30($out), $out
2753 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
# --- 2 blocks ---
2757 pxor @XMM[8+0], @XMM[0]
2758 lea 0x20($inp), $inp
2759 pxor @XMM[8+1], @XMM[1]
2760 lea 0x80(%rsp), %rax # pass key schedule
2761 mov %edx, %r10d # pass rounds
2763 call _bsaes_decrypt8
2765 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2766 pxor 0x10(%rsp), @XMM[1]
2767 movdqu @XMM[0], 0x00($out) # write output
2768 movdqu @XMM[1], 0x10($out)
2769 lea 0x20($out), $out
2771 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
# --- 1 block: fall back to asm_AES_decrypt ---
2775 pxor @XMM[0], @XMM[8]
2776 lea 0x10($inp), $inp
2777 movdqa @XMM[8], 0x20(%rbp)
2778 lea 0x20(%rbp), $arg1
2779 lea 0x20(%rbp), $arg2
2781 call asm_AES_decrypt # doesn't touch %xmm
2782 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2783 #pxor @XMM[8], @XMM[0]
2784 #lea 0x80(%rsp), %rax # pass key schedule
2785 #mov %edx, %r10d # pass rounds
2786 #call _bsaes_decrypt8
2787 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2788 movdqu @XMM[0], 0x00($out) # write output
2789 lea 0x10($out), $out
2791 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
# Ciphertext stealing (decrypt): the second-to-last block must be
# decrypted with the NEXT tweak first; keep the current tweak in
# @XMM[6] for the final partial-block pass below.
2798 movdqa .Lxts_magic(%rip), $twmask
2799 pcmpgtd @XMM[7], $twtmp
2800 pshufd \$0x13, $twtmp, $twres
2801 movdqa @XMM[7], @XMM[6]
2802 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2803 pand $twmask, $twres # isolate carry and residue
2804 movdqu ($inp), @XMM[0]
2805 pxor $twres, @XMM[7]
2807 lea 0x20(%rbp), $arg1
2808 pxor @XMM[7], @XMM[0]
2809 lea 0x20(%rbp), $arg2
2810 movdqa @XMM[0], 0x20(%rbp)
2812 call asm_AES_decrypt # doesn't touch %xmm
2813 pxor 0x20(%rbp), @XMM[7]
2815 movdqu @XMM[7], ($out)
# Steal loop: move tail bytes between the partial block and the block
# just written (loop branch not visible in this excerpt).
2818 movzb 16($inp), %eax
# Final pass: decrypt the reassembled block with the saved tweak in
# @XMM[6].
2827 movdqu ($out), @XMM[0]
2828 lea 0x20(%rbp), $arg1
2829 pxor @XMM[6], @XMM[0]
2830 lea 0x20(%rbp), $arg2
2831 movdqa @XMM[0], 0x20(%rbp)
2833 call asm_AES_decrypt # doesn't touch %xmm
2834 pxor 0x20(%rbp), @XMM[6]
2835 movdqu @XMM[6], ($out)
# Epilogue: wipe key schedule, restore xmm/GPRs, return.
2840 .Lxts_dec_bzero: # wipe key schedule [if any]
2841 movdqa %xmm0, 0x00(%rax)
2842 movdqa %xmm0, 0x10(%rax)
2843 lea 0x20(%rax), %rax
2847 lea (%rbp),%rsp # restore %rsp
2849 $code.=<<___ if ($win64);
2850 movaps 0x40(%rbp), %xmm6
2851 movaps 0x50(%rbp), %xmm7
2852 movaps 0x60(%rbp), %xmm8
2853 movaps 0x70(%rbp), %xmm9
2854 movaps 0x80(%rbp), %xmm10
2855 movaps 0x90(%rbp), %xmm11
2856 movaps 0xa0(%rbp), %xmm12
2857 movaps 0xb0(%rbp), %xmm13
2858 movaps 0xc0(%rbp), %xmm14
2859 movaps 0xd0(%rbp), %xmm15
2860 lea 0xa0(%rbp), %rsp
2863 mov 0x48(%rsp), %r15
2864 mov 0x50(%rsp), %r14
2865 mov 0x58(%rsp), %r13
2866 mov 0x60(%rsp), %r12
2867 mov 0x68(%rsp), %rbx
2868 mov 0x70(%rsp), %rax
2869 lea 0x78(%rsp), %rsp
2873 .size bsaes_xts_decrypt,.-bsaes_xts_decrypt
# Constant pool: shuffle masks for (inv)ShiftRows and byte-swapping,
# bit-slice masks, and CTR increment vectors. Layout/order matters:
# code above addresses entries relative to .LADD1 and .LBS0.
2877 .type _bsaes_const,\@object
2880 .LM0ISR: # InvShiftRows constants
2881 .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
2883 .quad 0x01040b0e0205080f, 0x0306090c00070a0d
2885 .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
2886 .LBS0: # bit-slice constants
2887 .quad 0x5555555555555555, 0x5555555555555555
2889 .quad 0x3333333333333333, 0x3333333333333333
2891 .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2892 .LSR: # shiftrows constants
2893 .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
2895 .quad 0x0304090e00050a0f, 0x01060b0c0207080d
2897 .quad 0x0a0e02060f03070b, 0x0004080c05090d01
2898 .LSWPUP: # byte-swap upper dword
2899 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
2901 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
2902 .LADD1: # counter increment constants
2903 .quad 0x0000000000000000, 0x0000000100000000
2905 .quad 0x0000000000000000, 0x0000000200000000
2907 .quad 0x0000000000000000, 0x0000000300000000
2909 .quad 0x0000000000000000, 0x0000000400000000
2911 .quad 0x0000000000000000, 0x0000000500000000
2913 .quad 0x0000000000000000, 0x0000000600000000
2915 .quad 0x0000000000000000, 0x0000000700000000
2917 .quad 0x0000000000000000, 0x0000000800000000
# Bit masks used by the bit-slicing transpose.
2921 .quad 0x0101010101010101, 0x0101010101010101
2922 .quad 0x0202020202020202, 0x0202020202020202
2923 .quad 0x0404040404040404, 0x0404040404040404
2924 .quad 0x0808080808080808, 0x0808080808080808
2926 .quad 0x02060a0e03070b0f, 0x0004080c0105090d
2928 .quad 0x6363636363636363, 0x6363636363636363
2929 .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2931 .size _bsaes_const,.-_bsaes_const
2934 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2935 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64 structured-exception handler: if the fault lies between the
# prologue and epilogue labels (HandlerData[0..1]), recover the saved
# xmm registers and callee-saved GPRs from the stack frame into the
# CONTEXT, then continue the unwind via RtlVirtualUnwind.
2943 .extern __imp_RtlVirtualUnwind
2944 .type se_handler,\@abi-omnipotent
2958 mov 120($context),%rax # pull context->Rax
2959 mov 248($context),%rbx # pull context->Rip
2961 mov 8($disp),%rsi # disp->ImageBase
2962 mov 56($disp),%r11 # disp->HandlerData
2964 mov 0(%r11),%r10d # HandlerData[0]
2965 lea (%rsi,%r10),%r10 # prologue label
2966 cmp %r10,%rbx # context->Rip<prologue label
2969 mov 152($context),%rax # pull context->Rsp
2971 mov 4(%r11),%r10d # HandlerData[1]
2972 lea (%rsi,%r10),%r10 # epilogue label
2973 cmp %r10,%rbx # context->Rip>=epilogue label
2976 mov 160($context),%rax # pull context->Rbp
# Copy the ten saved %xmm6-%xmm15 back into the CONTEXT record.
2978 lea 0x40(%rax),%rsi # %xmm save area
2979 lea 512($context),%rdi # &context.Xmm6
2980 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
2981 .long 0xa548f3fc # cld; rep movsq
2982 lea 0xa0(%rax),%rax # adjust stack pointer
2990 lea 0x78(%rax),%rax # adjust stack pointer
2991 mov %rbx,144($context) # restore context->Rbx
2992 mov %rbp,160($context) # restore context->Rbp
2993 mov %r12,216($context) # restore context->R12
2994 mov %r13,224($context) # restore context->R13
2995 mov %r14,232($context) # restore context->R14
2996 mov %r15,240($context) # restore context->R15
2999 mov %rax,152($context) # restore context->Rsp
3001 mov 40($disp),%rdi # disp->ContextRecord
3002 mov $context,%rsi # context
3003 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
3004 .long 0xa548f3fc # cld; rep movsq
3007 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
3008 mov 8(%rsi),%rdx # arg2, disp->ImageBase
3009 mov 0(%rsi),%r8 # arg3, disp->ControlPc
3010 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
3011 mov 40(%rsi),%r10 # disp->ContextRecord
3012 lea 56(%rsi),%r11 # &disp->HandlerData
3013 lea 24(%rsi),%r12 # &disp->EstablisherFrame
3014 mov %r10,32(%rsp) # arg5
3015 mov %r11,40(%rsp) # arg6
3016 mov %r12,48(%rsp) # arg7
3017 mov %rcx,56(%rsp) # arg8, (NULL)
3018 call *__imp_RtlVirtualUnwind(%rip)
3020 mov \$1,%eax # ExceptionContinueSearch
3032 .size se_handler,.-se_handler
# Win64 .pdata/.xdata tables: prologue/epilogue RVAs for the unwind
# directory, then HandlerData [body,epilogue] pairs consumed by
# se_handler above. ECB entries are emitted only when $ecb is enabled.
3037 $code.=<<___ if ($ecb);
3038 .rva .Lecb_enc_prologue
3039 .rva .Lecb_enc_epilogue
3042 .rva .Lecb_dec_prologue
3043 .rva .Lecb_dec_epilogue
3047 .rva .Lcbc_dec_prologue
3048 .rva .Lcbc_dec_epilogue
3051 .rva .Lctr_enc_prologue
3052 .rva .Lctr_enc_epilogue
3055 .rva .Lxts_enc_prologue
3056 .rva .Lxts_enc_epilogue
3059 .rva .Lxts_dec_prologue
3060 .rva .Lxts_dec_epilogue
3066 $code.=<<___ if ($ecb);
3070 .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
3074 .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
3080 .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
3084 .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
3088 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
3092 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
# perlasm post-processing: evaluate `...` arithmetic embedded in $code.
3096 $code =~ s/\`([^\`]*)\`/eval($1)/gem;