3 ###################################################################
4 ### AES-128 [originally in CTR mode] ###
5 ### bitsliced implementation for Intel Core 2 processors ###
6 ### requires support of SSE extensions up to SSSE3 ###
7 ### Author: Emilia Käsper and Peter Schwabe ###
8 ### Date: 2009-03-19 ###
11 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12 ### further information. ###
13 ###################################################################
17 # Started as transliteration to "perlasm" the original code has
18 # undergone following changes:
20 # - code was made position-independent;
21 # - rounds were folded into a loop resulting in >5x size reduction
22 # from 12.5KB to 2.2KB;
# - above was possible thanks to mixcolumns() modification that
24 # allowed to feed its output back to aesenc[last], this was
# achieved at cost of two additional inter-register moves;
26 # - some instruction reordering and interleaving;
27 # - this module doesn't implement key setup subroutine, instead it
28 # relies on conversion of "conventional" key schedule as returned
29 # by AES_set_encrypt_key (see discussion below);
30 # - first and last round keys are treated differently, which allowed
31 # to skip one shiftrows(), reduce bit-sliced key schedule and
32 # speed-up conversion by 22%;
33 # - support for 192- and 256-bit keys was added;
35 # Resulting performance in CPU cycles spent to encrypt one byte out
36 # of 4096-byte buffer with 128-bit key is:
38 # Emilia's this(*) difference
40 # Core 2 9.30 8.69 +7%
41 # Nehalem(**) 7.63 6.98 +9%
42 # Atom 17.1 17.4 -2%(***)
44 # (*) Comparison is not completely fair, because "this" is ECB,
45 # i.e. no extra processing such as counter values calculation
46 # and xor-ing input as in Emilia's CTR implementation is
# performed. However, the CTR calculations account for not more
48 # than 1% of total time, so comparison is *rather* fair.
50 # (**) Results were collected on Westmere, which is considered to
51 # be equivalent to Nehalem for this code.
53 # (***) Slowdown on Atom is rather strange per se, because original
54 # implementation has a number of 9+-bytes instructions, which
55 # are bad for Atom front-end, and which I eliminated completely.
56 # In attempt to address deterioration sbox() was tested in FP
57 # SIMD "domain" (movaps instead of movdqa, xorps instead of
58 # pxor, etc.). While it resulted in nominal 4% improvement on
# Atom, it hurt Westmere by more than 2x factor.
61 # As for key schedule conversion subroutine. Interface to OpenSSL
62 # relies on per-invocation on-the-fly conversion. This naturally
63 # has impact on performance, especially for short inputs. Conversion
64 # time in CPU cycles and its ratio to CPU cycles spent in 8x block
67 # conversion conversion/8x block
72 # The ratio values mean that 128-byte blocks will be processed
73 # 21-27% slower, 256-byte blocks - 12-16%, 384-byte blocks - 8-11%,
74 # etc. Then keep in mind that input sizes not divisible by 128 are
75 # *effectively* slower, especially shortest ones, e.g. consecutive
76 # 144-byte blocks are processed 44% slower than one would expect,
77 # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78 # it's still faster than ["hyper-threading-safe" code path in]
79 # aes-x86_64.pl on all lengths above 64 bytes...
83 # Add decryption procedure. Performance in CPU cycles spent to decrypt
84 # one byte out of 4096-byte buffer with 128-bit key is:
91 # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
92 # suboptimal, but XTS is meant to be used with larger blocks...
# If the first argument contains a dot it is actually the output file
# name and no flavour was given.
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Windows ABI is selected by the [nm]asm/mingw64 flavours or by an
# .asm output file name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl "perlasm" translator, first next to this
# script, then in the shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe everything we print through the translator.  Check the open:
# without this a failed spawn silently produces no assembly at all.
# (Low-precedence "or" is required here -- "|| die" would bind to the
# command string, which is always true, making the die unreachable.)
open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";
# Argument registers of the public entry points (SysV AMD64 calling
# convention: rdi, rsi, rdx, rcx).
# NOTE(review): five names are assigned from only four values, so $ivp
# is left undefined by this statement -- presumably it is set up
# elsewhere in the file before use; confirm against the elided code.
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
# XMM register pool used by the 8x block bodies, rotated so that
# %xmm15 comes first.
my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
# Scratch registers used inside the generated subroutines; this $key
# shadows the argument register of the same name within its scope.
my ($key,$rounds,$const)=("%rax","%r10d","%r11");
117 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
118 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
123 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
124 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
128 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
129 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
151 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
152 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
172 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
173 # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
177 &InvInBasisChange (@b);
178 &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
179 &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
182 sub InvInBasisChange { # OutBasisChange in reverse
183 my @b=@_[5,1,2,6,3,7,0,4];
201 sub InvOutBasisChange { # InBasisChange in reverse
202 my @b=@_[2,5,7,3,6,1,0,4];
223 #;*************************************************************
224 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
225 #;*************************************************************
226 my ($x0,$x1,$y0,$y1,$t0)=@_;
239 sub Mul_GF4_N { # not used, see next subroutine
240 # multiply and scale by N
241 my ($x0,$x1,$y0,$y1,$t0)=@_;
255 # interleaved Mul_GF4_N and Mul_GF4
256 my ($x0,$x1,$y0,$y1,$t0,
257 $x2,$x3,$y2,$y3,$t1)=@_;
285 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
292 Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
293 @x[2], @x[3], @y[2], @y[3], @t[2]);
305 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
306 @x[6], @x[7], @y[2], @y[3], @t[2]);
311 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
320 #;********************************************************************
321 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
322 #;********************************************************************
326 # direct optimizations from hardware
381 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
383 # new smaller inversion
417 # output in s3, s2, s1, t1
419 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
421 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
422 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
424 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
427 # AES linear components
433 pxor 0x00($key),@x[0]
434 pxor 0x10($key),@x[1]
436 pxor 0x20($key),@x[2]
438 pxor 0x30($key),@x[3]
440 pxor 0x40($key),@x[4]
442 pxor 0x50($key),@x[5]
444 pxor 0x60($key),@x[6]
446 pxor 0x70($key),@x[7]
454 # modified to emit output in order suitable for feeding back to aesenc[last]
458 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
459 pshufd \$0x93, @x[1], @t[1]
460 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
461 pshufd \$0x93, @x[2], @t[2]
463 pshufd \$0x93, @x[3], @t[3]
465 pshufd \$0x93, @x[4], @t[4]
467 pshufd \$0x93, @x[5], @t[5]
469 pshufd \$0x93, @x[6], @t[6]
471 pshufd \$0x93, @x[7], @t[7]
478 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
480 pshufd \$0x4E, @x[1], @x[1]
486 pshufd \$0x4E, @x[4], @t[0]
488 pshufd \$0x4E, @x[5], @t[1]
490 pshufd \$0x4E, @x[3], @x[4]
492 pshufd \$0x4E, @x[7], @x[5]
494 pshufd \$0x4E, @x[6], @x[3]
496 pshufd \$0x4E, @x[2], @x[6]
513 # multiplication by 0x0e
514 pshufd \$0x93, @x[7], @t[7]
516 pxor @x[5], @x[7] # 7 5
517 pxor @x[5], @x[2] # 2 5
518 pshufd \$0x93, @x[0], @t[0]
520 pxor @x[0], @x[5] # 5 0 [1]
521 pxor @x[1], @x[0] # 0 1
522 pshufd \$0x93, @x[1], @t[1]
523 pxor @x[2], @x[1] # 1 25
524 pxor @x[6], @x[0] # 01 6 [2]
525 pxor @x[3], @x[1] # 125 3 [4]
526 pshufd \$0x93, @x[3], @t[3]
527 pxor @x[0], @x[2] # 25 016 [3]
528 pxor @x[7], @x[3] # 3 75
529 pxor @x[6], @x[7] # 75 6 [0]
530 pshufd \$0x93, @x[6], @t[6]
532 pxor @x[4], @x[6] # 6 4
533 pxor @x[3], @x[4] # 4 375 [6]
534 pxor @x[7], @x[3] # 375 756=36
535 pxor @t[5], @x[6] # 64 5 [7]
536 pxor @t[2], @x[3] # 36 2
537 pxor @t[4], @x[3] # 362 4 [5]
538 pshufd \$0x93, @t[5], @t[5]
540 my @y = @x[7,5,0,2,1,3,4,6];
542 # multiplication by 0x0b
546 pshufd \$0x93, @t[2], @t[2]
550 pshufd \$0x93, @t[4], @t[4]
551 pxor @t[6], @t[7] # clobber t[7]
555 pshufd \$0x93, @t[0], @t[0]
559 pshufd \$0x93, @t[1], @t[1]
563 pshufd \$0x93, @t[2], @t[2]
567 pshufd \$0x93, @t[3], @t[3]
573 pxor @t[5], @t[7] # clobber t[7] even more
576 pshufd \$0x93, @t[4], @t[4]
581 pshufd \$0x93, @t[5], @t[5]
582 pxor @t[6], @t[7] # restore t[7]
584 # multiplication by 0x0d
587 pshufd \$0x93, @t[6], @t[6]
591 pshufd \$0x93, @t[7], @t[7]
600 pshufd \$0x93, @t[0], @t[0]
604 pshufd \$0x93, @t[1], @t[1]
609 pshufd \$0x93, @t[2], @t[2]
611 pxor @t[3], @t[6] # clobber t[6]
618 pshufd \$0x93, @t[4], @t[4]
621 pxor @t[3], @t[6] # restore t[6]
623 pshufd \$0x93, @t[5], @t[5]
624 pshufd \$0x93, @t[6], @t[6]
625 pshufd \$0x93, @t[7], @t[7]
626 pshufd \$0x93, @t[3], @t[3]
628 # multiplication by 0x09
630 pxor @y[1], @t[1] # t[1]=y[1]
631 pxor @t[5], @t[0] # clobber t[0]
634 pxor @y[0], @t[0] # t[0]=y[0]
636 pxor @t[7], @t[6] # clobber t[6]
639 pxor @y[4], @t[4] # t[4]=y[4]
641 pxor @y[3], @t[3] # t[3]=y[3]
643 pxor @y[2], @t[2] # t[2]=y[2]
645 pxor @y[5], @t[5] # t[5]=y[5]
648 pxor @y[6], @t[6] # t[6]=y[6]
649 pxor @y[7], @t[7] # t[7]=y[7]
662 sub aesenc { # not used
666 movdqa 0x30($const),@t[0] # .LSR
668 &ShiftRows (@b,@t[0]);
670 &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
673 sub aesenclast { # not used
677 movdqa 0x40($const),@t[0] # .LSRM0
679 &ShiftRows (@b,@t[0]);
682 pxor 0x00($key),@b[0]
683 pxor 0x10($key),@b[1]
684 pxor 0x20($key),@b[4]
685 pxor 0x30($key),@b[6]
686 pxor 0x40($key),@b[3]
687 pxor 0x50($key),@b[7]
688 pxor 0x60($key),@b[2]
689 pxor 0x70($key),@b[5]
694 my ($a,$b,$n,$mask,$t)=@_;
706 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
726 my @x=reverse(@_[0..7]);
727 my ($t0,$t1,$t2,$t3)=@_[8..11];
729 movdqa 0x00($const),$t0 # .LBS0
730 movdqa 0x10($const),$t1 # .LBS1
732 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
733 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
735 movdqa 0x20($const),$t0 # .LBS2
737 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
738 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
740 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
741 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
747 .extern asm_AES_encrypt
748 .extern asm_AES_decrypt
750 .type _bsaes_encrypt8,\@abi-omnipotent
753 lea .LBS0(%rip), $const # constants table
755 movdqa ($key), @XMM[9] # round 0 key
757 movdqa 0x60($const), @XMM[8] # .LM0SR
758 pxor @XMM[9], @XMM[0] # xor with round0 key
759 pxor @XMM[9], @XMM[1]
760 pshufb @XMM[8], @XMM[0]
761 pxor @XMM[9], @XMM[2]
762 pshufb @XMM[8], @XMM[1]
763 pxor @XMM[9], @XMM[3]
764 pshufb @XMM[8], @XMM[2]
765 pxor @XMM[9], @XMM[4]
766 pshufb @XMM[8], @XMM[3]
767 pxor @XMM[9], @XMM[5]
768 pshufb @XMM[8], @XMM[4]
769 pxor @XMM[9], @XMM[6]
770 pshufb @XMM[8], @XMM[5]
771 pxor @XMM[9], @XMM[7]
772 pshufb @XMM[8], @XMM[6]
773 pshufb @XMM[8], @XMM[7]
774 _bsaes_encrypt8_bitslice:
776 &bitslice (@XMM[0..7, 8..11]);
783 &ShiftRows (@XMM[0..7, 8]);
784 $code.=".Lenc_sbox:\n";
785 &Sbox (@XMM[0..7, 8..15]);
790 &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
792 movdqa 0x30($const), @XMM[8] # .LSR
794 movdqa 0x40($const), @XMM[8] # .LSRM0
799 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
800 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
802 movdqa ($key), @XMM[8] # last round key
803 pxor @XMM[8], @XMM[4]
804 pxor @XMM[8], @XMM[6]
805 pxor @XMM[8], @XMM[3]
806 pxor @XMM[8], @XMM[7]
807 pxor @XMM[8], @XMM[2]
808 pxor @XMM[8], @XMM[5]
809 pxor @XMM[8], @XMM[0]
810 pxor @XMM[8], @XMM[1]
812 .size _bsaes_encrypt8,.-_bsaes_encrypt8
814 .type _bsaes_decrypt8,\@abi-omnipotent
817 lea .LBS0(%rip), $const # constants table
819 movdqa ($key), @XMM[9] # round 0 key
821 movdqa -0x30($const), @XMM[8] # .LM0ISR
822 pxor @XMM[9], @XMM[0] # xor with round0 key
823 pxor @XMM[9], @XMM[1]
824 pshufb @XMM[8], @XMM[0]
825 pxor @XMM[9], @XMM[2]
826 pshufb @XMM[8], @XMM[1]
827 pxor @XMM[9], @XMM[3]
828 pshufb @XMM[8], @XMM[2]
829 pxor @XMM[9], @XMM[4]
830 pshufb @XMM[8], @XMM[3]
831 pxor @XMM[9], @XMM[5]
832 pshufb @XMM[8], @XMM[4]
833 pxor @XMM[9], @XMM[6]
834 pshufb @XMM[8], @XMM[5]
835 pxor @XMM[9], @XMM[7]
836 pshufb @XMM[8], @XMM[6]
837 pshufb @XMM[8], @XMM[7]
839 &bitslice (@XMM[0..7, 8..11]);
846 &ShiftRows (@XMM[0..7, 8]);
847 $code.=".Ldec_sbox:\n";
848 &InvSbox (@XMM[0..7, 8..15]);
853 &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
855 movdqa -0x10($const), @XMM[8] # .LISR
857 movdqa -0x20($const), @XMM[8] # .LISRM0
862 &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
864 movdqa ($key), @XMM[8] # last round key
865 pxor @XMM[8], @XMM[6]
866 pxor @XMM[8], @XMM[4]
867 pxor @XMM[8], @XMM[2]
868 pxor @XMM[8], @XMM[7]
869 pxor @XMM[8], @XMM[3]
870 pxor @XMM[8], @XMM[5]
871 pxor @XMM[8], @XMM[0]
872 pxor @XMM[8], @XMM[1]
874 .size _bsaes_decrypt8,.-_bsaes_decrypt8
878 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
881 my @x=reverse(@_[0..7]);
882 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
884 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
886 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
890 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
892 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
894 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
900 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
901 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
905 .type _bsaes_key_convert,\@abi-omnipotent
908 lea .LBS1(%rip), $const
909 movdqu ($inp), %xmm7 # load round 0 key
910 movdqa -0x10($const), %xmm8 # .LBS0
911 movdqa 0x00($const), %xmm9 # .LBS1
912 movdqa 0x10($const), %xmm10 # .LBS2
913 movdqa 0x40($const), %xmm13 # .LM0
914 movdqa 0x60($const), %xmm14 # .LNOT
916 movdqu 0x10($inp), %xmm6 # load round 1 key
918 movdqa %xmm7, ($out) # save round 0 key
924 pshufb %xmm13, %xmm6 # .LM0
927 &bitslice_key (map("%xmm$_",(0..7, 8..12)));
929 pxor %xmm14, %xmm5 # "pnot"
934 movdqa %xmm0, 0x00($out) # write bit-sliced round key
935 movdqa %xmm1, 0x10($out)
936 movdqa %xmm2, 0x20($out)
937 movdqa %xmm3, 0x30($out)
938 movdqa %xmm4, 0x40($out)
939 movdqa %xmm5, 0x50($out)
940 movdqa %xmm6, 0x60($out)
941 movdqa %xmm7, 0x70($out)
943 movdqu ($inp), %xmm6 # load next round key
947 movdqa 0x70($const), %xmm7 # .L63
948 #movdqa %xmm6, ($out) # don't save last round key
950 .size _bsaes_key_convert,.-_bsaes_key_convert
954 if (0 && !$win64) { # following four functions are unsupported interface
955 # used for benchmarking...
957 .globl bsaes_enc_key_convert
958 .type bsaes_enc_key_convert,\@function,2
960 bsaes_enc_key_convert:
961 mov 240($inp),%r10d # pass rounds
962 mov $inp,%rcx # pass key
963 mov $out,%rax # pass key schedule
964 call _bsaes_key_convert
965 pxor %xmm6,%xmm7 # fix up last round key
966 movdqa %xmm7,(%rax) # save last round key
968 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
970 .globl bsaes_encrypt_128
971 .type bsaes_encrypt_128,\@function,4
975 movdqu 0x00($inp), @XMM[0] # load input
976 movdqu 0x10($inp), @XMM[1]
977 movdqu 0x20($inp), @XMM[2]
978 movdqu 0x30($inp), @XMM[3]
979 movdqu 0x40($inp), @XMM[4]
980 movdqu 0x50($inp), @XMM[5]
981 movdqu 0x60($inp), @XMM[6]
982 movdqu 0x70($inp), @XMM[7]
983 mov $key, %rax # pass the $key
989 movdqu @XMM[0], 0x00($out) # write output
990 movdqu @XMM[1], 0x10($out)
991 movdqu @XMM[4], 0x20($out)
992 movdqu @XMM[6], 0x30($out)
993 movdqu @XMM[3], 0x40($out)
994 movdqu @XMM[7], 0x50($out)
995 movdqu @XMM[2], 0x60($out)
996 movdqu @XMM[5], 0x70($out)
1001 .size bsaes_encrypt_128,.-bsaes_encrypt_128
1003 .globl bsaes_dec_key_convert
1004 .type bsaes_dec_key_convert,\@function,2
1006 bsaes_dec_key_convert:
1007 mov 240($inp),%r10d # pass rounds
1008 mov $inp,%rcx # pass key
1009 mov $out,%rax # pass key schedule
1010 call _bsaes_key_convert
1011 pxor ($out),%xmm7 # fix up round 0 key
1012 movdqa %xmm6,(%rax) # save last round key
1015 .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
1017 .globl bsaes_decrypt_128
1018 .type bsaes_decrypt_128,\@function,4
1022 movdqu 0x00($inp), @XMM[0] # load input
1023 movdqu 0x10($inp), @XMM[1]
1024 movdqu 0x20($inp), @XMM[2]
1025 movdqu 0x30($inp), @XMM[3]
1026 movdqu 0x40($inp), @XMM[4]
1027 movdqu 0x50($inp), @XMM[5]
1028 movdqu 0x60($inp), @XMM[6]
1029 movdqu 0x70($inp), @XMM[7]
1030 mov $key, %rax # pass the $key
1031 lea 0x80($inp), $inp
1034 call _bsaes_decrypt8
1036 movdqu @XMM[0], 0x00($out) # write output
1037 movdqu @XMM[1], 0x10($out)
1038 movdqu @XMM[6], 0x20($out)
1039 movdqu @XMM[4], 0x30($out)
1040 movdqu @XMM[2], 0x40($out)
1041 movdqu @XMM[7], 0x50($out)
1042 movdqu @XMM[3], 0x60($out)
1043 movdqu @XMM[5], 0x70($out)
1044 lea 0x80($out), $out
1048 .size bsaes_decrypt_128,.-bsaes_decrypt_128
1052 ######################################################################
1056 my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
1057 : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
1058 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
1062 .globl bsaes_ecb_encrypt_blocks
1063 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
1065 bsaes_ecb_encrypt_blocks:
1074 lea -0x48(%rsp),%rsp
1076 $code.=<<___ if ($win64);
1077 lea -0xa0(%rsp), %rsp
1078 movaps %xmm6, 0x40(%rsp)
1079 movaps %xmm7, 0x50(%rsp)
1080 movaps %xmm8, 0x60(%rsp)
1081 movaps %xmm9, 0x70(%rsp)
1082 movaps %xmm10, 0x80(%rsp)
1083 movaps %xmm11, 0x90(%rsp)
1084 movaps %xmm12, 0xa0(%rsp)
1085 movaps %xmm13, 0xb0(%rsp)
1086 movaps %xmm14, 0xc0(%rsp)
1087 movaps %xmm15, 0xd0(%rsp)
1091 mov %rsp,%rbp # backup %rsp
1092 mov 240($arg4),%eax # rounds
1093 mov $arg1,$inp # backup arguments
1100 mov %eax,%ebx # backup rounds
1101 shl \$7,%rax # 128 bytes per inner round key
1102 sub \$`128-32`,%rax # size of bit-sliced key schedule
1104 mov %rsp,%rax # pass key schedule
1105 mov $key,%rcx # pass key
1106 mov %ebx,%r10d # pass rounds
1107 call _bsaes_key_convert
1108 pxor %xmm6,%xmm7 # fix up last round key
1109 movdqa %xmm7,(%rax) # save last round key
1113 movdqu 0x00($inp), @XMM[0] # load input
1114 movdqu 0x10($inp), @XMM[1]
1115 movdqu 0x20($inp), @XMM[2]
1116 movdqu 0x30($inp), @XMM[3]
1117 movdqu 0x40($inp), @XMM[4]
1118 movdqu 0x50($inp), @XMM[5]
1119 mov %rsp, %rax # pass key schedule
1120 movdqu 0x60($inp), @XMM[6]
1121 mov %ebx,%r10d # pass rounds
1122 movdqu 0x70($inp), @XMM[7]
1123 lea 0x80($inp), $inp
1125 call _bsaes_encrypt8
1127 movdqu @XMM[0], 0x00($out) # write output
1128 movdqu @XMM[1], 0x10($out)
1129 movdqu @XMM[4], 0x20($out)
1130 movdqu @XMM[6], 0x30($out)
1131 movdqu @XMM[3], 0x40($out)
1132 movdqu @XMM[7], 0x50($out)
1133 movdqu @XMM[2], 0x60($out)
1134 movdqu @XMM[5], 0x70($out)
1135 lea 0x80($out), $out
1142 movdqu 0x00($inp), @XMM[0] # load input
1143 mov %rsp, %rax # pass key schedule
1144 mov %ebx,%r10d # pass rounds
1147 movdqu 0x10($inp), @XMM[1]
1149 movdqu 0x20($inp), @XMM[2]
1152 movdqu 0x30($inp), @XMM[3]
1154 movdqu 0x40($inp), @XMM[4]
1157 movdqu 0x50($inp), @XMM[5]
1159 movdqu 0x60($inp), @XMM[6]
1160 call _bsaes_encrypt8
1161 movdqu @XMM[0], 0x00($out) # write output
1162 movdqu @XMM[1], 0x10($out)
1163 movdqu @XMM[4], 0x20($out)
1164 movdqu @XMM[6], 0x30($out)
1165 movdqu @XMM[3], 0x40($out)
1166 movdqu @XMM[7], 0x50($out)
1167 movdqu @XMM[2], 0x60($out)
1171 call _bsaes_encrypt8
1172 movdqu @XMM[0], 0x00($out) # write output
1173 movdqu @XMM[1], 0x10($out)
1174 movdqu @XMM[4], 0x20($out)
1175 movdqu @XMM[6], 0x30($out)
1176 movdqu @XMM[3], 0x40($out)
1177 movdqu @XMM[7], 0x50($out)
1181 call _bsaes_encrypt8
1182 movdqu @XMM[0], 0x00($out) # write output
1183 movdqu @XMM[1], 0x10($out)
1184 movdqu @XMM[4], 0x20($out)
1185 movdqu @XMM[6], 0x30($out)
1186 movdqu @XMM[3], 0x40($out)
1190 call _bsaes_encrypt8
1191 movdqu @XMM[0], 0x00($out) # write output
1192 movdqu @XMM[1], 0x10($out)
1193 movdqu @XMM[4], 0x20($out)
1194 movdqu @XMM[6], 0x30($out)
1198 call _bsaes_encrypt8
1199 movdqu @XMM[0], 0x00($out) # write output
1200 movdqu @XMM[1], 0x10($out)
1201 movdqu @XMM[4], 0x20($out)
1205 call _bsaes_encrypt8
1206 movdqu @XMM[0], 0x00($out) # write output
1207 movdqu @XMM[1], 0x10($out)
1211 call _bsaes_encrypt8
1212 movdqu @XMM[0], 0x00($out) # write output
1219 call asm_AES_encrypt
1228 .Lecb_enc_bzero: # wipe key schedule [if any]
1229 movdqa %xmm0, 0x00(%rax)
1230 movdqa %xmm0, 0x10(%rax)
1231 lea 0x20(%rax), %rax
1235 lea (%rbp),%rsp # restore %rsp
1237 $code.=<<___ if ($win64);
1238 movaps 0x40(%rbp), %xmm6
1239 movaps 0x50(%rbp), %xmm7
1240 movaps 0x60(%rbp), %xmm8
1241 movaps 0x70(%rbp), %xmm9
1242 movaps 0x80(%rbp), %xmm10
1243 movaps 0x90(%rbp), %xmm11
1244 movaps 0xa0(%rbp), %xmm12
1245 movaps 0xb0(%rbp), %xmm13
1246 movaps 0xc0(%rbp), %xmm14
1247 movaps 0xd0(%rbp), %xmm15
1248 lea 0xa0(%rbp), %rsp
1251 mov 0x48(%rsp), %r15
1252 mov 0x50(%rsp), %r14
1253 mov 0x58(%rsp), %r13
1254 mov 0x60(%rsp), %r12
1255 mov 0x68(%rsp), %rbx
1256 mov 0x70(%rsp), %rax
1257 lea 0x78(%rsp), %rsp
1261 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
1263 .globl bsaes_ecb_decrypt_blocks
1264 .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
1266 bsaes_ecb_decrypt_blocks:
1275 lea -0x48(%rsp),%rsp
1277 $code.=<<___ if ($win64);
1278 lea -0xa0(%rsp), %rsp
1279 movaps %xmm6, 0x40(%rsp)
1280 movaps %xmm7, 0x50(%rsp)
1281 movaps %xmm8, 0x60(%rsp)
1282 movaps %xmm9, 0x70(%rsp)
1283 movaps %xmm10, 0x80(%rsp)
1284 movaps %xmm11, 0x90(%rsp)
1285 movaps %xmm12, 0xa0(%rsp)
1286 movaps %xmm13, 0xb0(%rsp)
1287 movaps %xmm14, 0xc0(%rsp)
1288 movaps %xmm15, 0xd0(%rsp)
1292 mov %rsp,%rbp # backup %rsp
1293 mov 240($arg4),%eax # rounds
1294 mov $arg1,$inp # backup arguments
1301 mov %eax,%ebx # backup rounds
1302 shl \$7,%rax # 128 bytes per inner round key
1303 sub \$`128-32`,%rax # size of bit-sliced key schedule
1305 mov %rsp,%rax # pass key schedule
1306 mov $key,%rcx # pass key
1307 mov %ebx,%r10d # pass rounds
1308 call _bsaes_key_convert
1309 pxor (%rsp),%xmm7 # fix up 0 round key
1310 movdqa %xmm6,(%rax) # save last round key
1315 movdqu 0x00($inp), @XMM[0] # load input
1316 movdqu 0x10($inp), @XMM[1]
1317 movdqu 0x20($inp), @XMM[2]
1318 movdqu 0x30($inp), @XMM[3]
1319 movdqu 0x40($inp), @XMM[4]
1320 movdqu 0x50($inp), @XMM[5]
1321 mov %rsp, %rax # pass key schedule
1322 movdqu 0x60($inp), @XMM[6]
1323 mov %ebx,%r10d # pass rounds
1324 movdqu 0x70($inp), @XMM[7]
1325 lea 0x80($inp), $inp
1327 call _bsaes_decrypt8
1329 movdqu @XMM[0], 0x00($out) # write output
1330 movdqu @XMM[1], 0x10($out)
1331 movdqu @XMM[6], 0x20($out)
1332 movdqu @XMM[4], 0x30($out)
1333 movdqu @XMM[2], 0x40($out)
1334 movdqu @XMM[7], 0x50($out)
1335 movdqu @XMM[3], 0x60($out)
1336 movdqu @XMM[5], 0x70($out)
1337 lea 0x80($out), $out
1344 movdqu 0x00($inp), @XMM[0] # load input
1345 mov %rsp, %rax # pass key schedule
1346 mov %ebx,%r10d # pass rounds
1349 movdqu 0x10($inp), @XMM[1]
1351 movdqu 0x20($inp), @XMM[2]
1354 movdqu 0x30($inp), @XMM[3]
1356 movdqu 0x40($inp), @XMM[4]
1359 movdqu 0x50($inp), @XMM[5]
1361 movdqu 0x60($inp), @XMM[6]
1362 call _bsaes_decrypt8
1363 movdqu @XMM[0], 0x00($out) # write output
1364 movdqu @XMM[1], 0x10($out)
1365 movdqu @XMM[6], 0x20($out)
1366 movdqu @XMM[4], 0x30($out)
1367 movdqu @XMM[2], 0x40($out)
1368 movdqu @XMM[7], 0x50($out)
1369 movdqu @XMM[3], 0x60($out)
1373 call _bsaes_decrypt8
1374 movdqu @XMM[0], 0x00($out) # write output
1375 movdqu @XMM[1], 0x10($out)
1376 movdqu @XMM[6], 0x20($out)
1377 movdqu @XMM[4], 0x30($out)
1378 movdqu @XMM[2], 0x40($out)
1379 movdqu @XMM[7], 0x50($out)
1383 call _bsaes_decrypt8
1384 movdqu @XMM[0], 0x00($out) # write output
1385 movdqu @XMM[1], 0x10($out)
1386 movdqu @XMM[6], 0x20($out)
1387 movdqu @XMM[4], 0x30($out)
1388 movdqu @XMM[2], 0x40($out)
1392 call _bsaes_decrypt8
1393 movdqu @XMM[0], 0x00($out) # write output
1394 movdqu @XMM[1], 0x10($out)
1395 movdqu @XMM[6], 0x20($out)
1396 movdqu @XMM[4], 0x30($out)
1400 call _bsaes_decrypt8
1401 movdqu @XMM[0], 0x00($out) # write output
1402 movdqu @XMM[1], 0x10($out)
1403 movdqu @XMM[6], 0x20($out)
1407 call _bsaes_decrypt8
1408 movdqu @XMM[0], 0x00($out) # write output
1409 movdqu @XMM[1], 0x10($out)
1413 call _bsaes_decrypt8
1414 movdqu @XMM[0], 0x00($out) # write output
1421 call asm_AES_decrypt
1430 .Lecb_dec_bzero: # wipe key schedule [if any]
1431 movdqa %xmm0, 0x00(%rax)
1432 movdqa %xmm0, 0x10(%rax)
1433 lea 0x20(%rax), %rax
1437 lea (%rbp),%rsp # restore %rsp
1439 $code.=<<___ if ($win64);
1440 movaps 0x40(%rbp), %xmm6
1441 movaps 0x50(%rbp), %xmm7
1442 movaps 0x60(%rbp), %xmm8
1443 movaps 0x70(%rbp), %xmm9
1444 movaps 0x80(%rbp), %xmm10
1445 movaps 0x90(%rbp), %xmm11
1446 movaps 0xa0(%rbp), %xmm12
1447 movaps 0xb0(%rbp), %xmm13
1448 movaps 0xc0(%rbp), %xmm14
1449 movaps 0xd0(%rbp), %xmm15
1450 lea 0xa0(%rbp), %rsp
1453 mov 0x48(%rsp), %r15
1454 mov 0x50(%rsp), %r14
1455 mov 0x58(%rsp), %r13
1456 mov 0x60(%rsp), %r12
1457 mov 0x68(%rsp), %rbx
1458 mov 0x70(%rsp), %rax
1459 lea 0x78(%rsp), %rsp
1463 .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
1467 .extern asm_AES_cbc_encrypt
1468 .globl bsaes_cbc_encrypt
1469 .type bsaes_cbc_encrypt,\@abi-omnipotent
1473 $code.=<<___ if ($win64);
1474 mov 48(%rsp),$arg6 # pull direction flag
1478 jne asm_AES_cbc_encrypt
1480 jb asm_AES_cbc_encrypt
1490 lea -0x48(%rsp), %rsp
1492 $code.=<<___ if ($win64);
1493 mov 0xa0(%rsp),$arg5 # pull ivp
1494 lea -0xa0(%rsp), %rsp
1495 movaps %xmm6, 0x40(%rsp)
1496 movaps %xmm7, 0x50(%rsp)
1497 movaps %xmm8, 0x60(%rsp)
1498 movaps %xmm9, 0x70(%rsp)
1499 movaps %xmm10, 0x80(%rsp)
1500 movaps %xmm11, 0x90(%rsp)
1501 movaps %xmm12, 0xa0(%rsp)
1502 movaps %xmm13, 0xb0(%rsp)
1503 movaps %xmm14, 0xc0(%rsp)
1504 movaps %xmm15, 0xd0(%rsp)
1508 mov %rsp, %rbp # backup %rsp
1509 mov 240($arg4), %eax # rounds
1510 mov $arg1, $inp # backup arguments
1515 shr \$4, $len # bytes to blocks
1517 mov %eax, %edx # rounds
1518 shl \$7, %rax # 128 bytes per inner round key
1519 sub \$`128-32`, %rax # size of bit-sliced key schedule
1522 mov %rsp, %rax # pass key schedule
1523 mov $key, %rcx # pass key
1524 mov %edx, %r10d # pass rounds
1525 call _bsaes_key_convert
1526 pxor (%rsp),%xmm7 # fix up 0 round key
1527 movdqa %xmm6,(%rax) # save last round key
1530 movdqu (%rbx), @XMM[15] # load IV
1533 movdqu 0x00($inp), @XMM[0] # load input
1534 movdqu 0x10($inp), @XMM[1]
1535 movdqu 0x20($inp), @XMM[2]
1536 movdqu 0x30($inp), @XMM[3]
1537 movdqu 0x40($inp), @XMM[4]
1538 movdqu 0x50($inp), @XMM[5]
1539 mov %rsp, %rax # pass key schedule
1540 movdqu 0x60($inp), @XMM[6]
1541 mov %edx,%r10d # pass rounds
1542 movdqu 0x70($inp), @XMM[7]
1543 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1545 call _bsaes_decrypt8
1547 pxor 0x20(%rbp), @XMM[0] # ^= IV
1548 movdqu 0x00($inp), @XMM[8] # re-load input
1549 movdqu 0x10($inp), @XMM[9]
1550 pxor @XMM[8], @XMM[1]
1551 movdqu 0x20($inp), @XMM[10]
1552 pxor @XMM[9], @XMM[6]
1553 movdqu 0x30($inp), @XMM[11]
1554 pxor @XMM[10], @XMM[4]
1555 movdqu 0x40($inp), @XMM[12]
1556 pxor @XMM[11], @XMM[2]
1557 movdqu 0x50($inp), @XMM[13]
1558 pxor @XMM[12], @XMM[7]
1559 movdqu 0x60($inp), @XMM[14]
1560 pxor @XMM[13], @XMM[3]
1561 movdqu 0x70($inp), @XMM[15] # IV
1562 pxor @XMM[14], @XMM[5]
1563 movdqu @XMM[0], 0x00($out) # write output
1564 lea 0x80($inp), $inp
1565 movdqu @XMM[1], 0x10($out)
1566 movdqu @XMM[6], 0x20($out)
1567 movdqu @XMM[4], 0x30($out)
1568 movdqu @XMM[2], 0x40($out)
1569 movdqu @XMM[7], 0x50($out)
1570 movdqu @XMM[3], 0x60($out)
1571 movdqu @XMM[5], 0x70($out)
1572 lea 0x80($out), $out
1579 movdqu 0x00($inp), @XMM[0] # load input
1580 mov %rsp, %rax # pass key schedule
1581 mov %edx, %r10d # pass rounds
1584 movdqu 0x10($inp), @XMM[1]
1586 movdqu 0x20($inp), @XMM[2]
1589 movdqu 0x30($inp), @XMM[3]
1591 movdqu 0x40($inp), @XMM[4]
1594 movdqu 0x50($inp), @XMM[5]
1596 movdqu 0x60($inp), @XMM[6]
1597 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1598 call _bsaes_decrypt8
1599 pxor 0x20(%rbp), @XMM[0] # ^= IV
1600 movdqu 0x00($inp), @XMM[8] # re-load input
1601 movdqu 0x10($inp), @XMM[9]
1602 pxor @XMM[8], @XMM[1]
1603 movdqu 0x20($inp), @XMM[10]
1604 pxor @XMM[9], @XMM[6]
1605 movdqu 0x30($inp), @XMM[11]
1606 pxor @XMM[10], @XMM[4]
1607 movdqu 0x40($inp), @XMM[12]
1608 pxor @XMM[11], @XMM[2]
1609 movdqu 0x50($inp), @XMM[13]
1610 pxor @XMM[12], @XMM[7]
1611 movdqu 0x60($inp), @XMM[15] # IV
1612 pxor @XMM[13], @XMM[3]
1613 movdqu @XMM[0], 0x00($out) # write output
1614 movdqu @XMM[1], 0x10($out)
1615 movdqu @XMM[6], 0x20($out)
1616 movdqu @XMM[4], 0x30($out)
1617 movdqu @XMM[2], 0x40($out)
1618 movdqu @XMM[7], 0x50($out)
1619 movdqu @XMM[3], 0x60($out)
1623 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1624 call _bsaes_decrypt8
1625 pxor 0x20(%rbp), @XMM[0] # ^= IV
1626 movdqu 0x00($inp), @XMM[8] # re-load input
1627 movdqu 0x10($inp), @XMM[9]
1628 pxor @XMM[8], @XMM[1]
1629 movdqu 0x20($inp), @XMM[10]
1630 pxor @XMM[9], @XMM[6]
1631 movdqu 0x30($inp), @XMM[11]
1632 pxor @XMM[10], @XMM[4]
1633 movdqu 0x40($inp), @XMM[12]
1634 pxor @XMM[11], @XMM[2]
1635 movdqu 0x50($inp), @XMM[15] # IV
1636 pxor @XMM[12], @XMM[7]
1637 movdqu @XMM[0], 0x00($out) # write output
1638 movdqu @XMM[1], 0x10($out)
1639 movdqu @XMM[6], 0x20($out)
1640 movdqu @XMM[4], 0x30($out)
1641 movdqu @XMM[2], 0x40($out)
1642 movdqu @XMM[7], 0x50($out)
1646 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1647 call _bsaes_decrypt8
1648 pxor 0x20(%rbp), @XMM[0] # ^= IV
1649 movdqu 0x00($inp), @XMM[8] # re-load input
1650 movdqu 0x10($inp), @XMM[9]
1651 pxor @XMM[8], @XMM[1]
1652 movdqu 0x20($inp), @XMM[10]
1653 pxor @XMM[9], @XMM[6]
1654 movdqu 0x30($inp), @XMM[11]
1655 pxor @XMM[10], @XMM[4]
1656 movdqu 0x40($inp), @XMM[15] # IV
1657 pxor @XMM[11], @XMM[2]
1658 movdqu @XMM[0], 0x00($out) # write output
1659 movdqu @XMM[1], 0x10($out)
1660 movdqu @XMM[6], 0x20($out)
1661 movdqu @XMM[4], 0x30($out)
1662 movdqu @XMM[2], 0x40($out)
1666 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1667 call _bsaes_decrypt8
1668 pxor 0x20(%rbp), @XMM[0] # ^= IV
1669 movdqu 0x00($inp), @XMM[8] # re-load input
1670 movdqu 0x10($inp), @XMM[9]
1671 pxor @XMM[8], @XMM[1]
1672 movdqu 0x20($inp), @XMM[10]
1673 pxor @XMM[9], @XMM[6]
1674 movdqu 0x30($inp), @XMM[15] # IV
1675 pxor @XMM[10], @XMM[4]
1676 movdqu @XMM[0], 0x00($out) # write output
1677 movdqu @XMM[1], 0x10($out)
1678 movdqu @XMM[6], 0x20($out)
1679 movdqu @XMM[4], 0x30($out)
1683 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1684 call _bsaes_decrypt8
1685 pxor 0x20(%rbp), @XMM[0] # ^= IV
1686 movdqu 0x00($inp), @XMM[8] # re-load input
1687 movdqu 0x10($inp), @XMM[9]
1688 pxor @XMM[8], @XMM[1]
1689 movdqu 0x20($inp), @XMM[15] # IV
1690 pxor @XMM[9], @XMM[6]
1691 movdqu @XMM[0], 0x00($out) # write output
1692 movdqu @XMM[1], 0x10($out)
1693 movdqu @XMM[6], 0x20($out)
1697 movdqa @XMM[15], 0x20(%rbp) # put aside IV
1698 call _bsaes_decrypt8
1699 pxor 0x20(%rbp), @XMM[0] # ^= IV
1700 movdqu 0x00($inp), @XMM[8] # re-load input
1701 movdqu 0x10($inp), @XMM[15] # IV
1702 pxor @XMM[8], @XMM[1]
1703 movdqu @XMM[0], 0x00($out) # write output
1704 movdqu @XMM[1], 0x10($out)
1709 lea 0x20(%rbp), $arg2 # buffer output
1711 call asm_AES_decrypt # doesn't touch %xmm
1712 pxor 0x20(%rbp), @XMM[15] # ^= IV
1713 movdqu @XMM[15], ($out) # write output
1714 movdqa @XMM[0], @XMM[15] # IV
1717 movdqu @XMM[15], (%rbx) # return IV
1720 .Lcbc_dec_bzero: # wipe key schedule [if any]
1721 movdqa %xmm0, 0x00(%rax)
1722 movdqa %xmm0, 0x10(%rax)
1723 lea 0x20(%rax), %rax
1727 lea (%rbp),%rsp # restore %rsp
1729 $code.=<<___ if ($win64);
1730 movaps 0x40(%rbp), %xmm6
1731 movaps 0x50(%rbp), %xmm7
1732 movaps 0x60(%rbp), %xmm8
1733 movaps 0x70(%rbp), %xmm9
1734 movaps 0x80(%rbp), %xmm10
1735 movaps 0x90(%rbp), %xmm11
1736 movaps 0xa0(%rbp), %xmm12
1737 movaps 0xb0(%rbp), %xmm13
1738 movaps 0xc0(%rbp), %xmm14
1739 movaps 0xd0(%rbp), %xmm15
1740 lea 0xa0(%rbp), %rsp
1743 mov 0x48(%rsp), %r15
1744 mov 0x50(%rsp), %r14
1745 mov 0x58(%rsp), %r13
1746 mov 0x60(%rsp), %r12
1747 mov 0x68(%rsp), %rbx
1748 mov 0x70(%rsp), %rax
1749 lea 0x78(%rsp), %rsp
1753 .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
###
### bsaes_ctr32_encrypt_blocks: CTR-mode encryption with a 32-bit
### big-endian counter in the low dword of the IV. Processes 8 blocks
### per iteration through the bit-sliced core; a sub-8-block tail is
### handled by per-count stubs, and a final partial step falls back to
### asm_AES_encrypt. The counter copy lives at 0x20(%rbp).
### NOTE(review): partial view of the file -- code left byte-identical.
###
1755	.globl	bsaes_ctr32_encrypt_blocks
1756	.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
1758	bsaes_ctr32_encrypt_blocks:
1767	lea	-0x48(%rsp), %rsp
# Win64 only: fetch 5th argument from the shadow area and save XMM6-15
1769	$code.=<<___ if ($win64);
1770	mov	0xa0(%rsp),$arg5	# pull ivp
1771	lea	-0xa0(%rsp), %rsp
1772	movaps	%xmm6, 0x40(%rsp)
1773	movaps	%xmm7, 0x50(%rsp)
1774	movaps	%xmm8, 0x60(%rsp)
1775	movaps	%xmm9, 0x70(%rsp)
1776	movaps	%xmm10, 0x80(%rsp)
1777	movaps	%xmm11, 0x90(%rsp)
1778	movaps	%xmm12, 0xa0(%rsp)
1779	movaps	%xmm13, 0xb0(%rsp)
1780	movaps	%xmm14, 0xc0(%rsp)
1781	movaps	%xmm15, 0xd0(%rsp)
1785	mov	%rsp, %rbp		# backup %rsp
1786	movdqu	($arg5), %xmm0		# load counter
1787	mov	240($arg4), %eax	# rounds
1788	mov	$arg1, $inp		# backup arguments
1792	movdqa	%xmm0, 0x20(%rbp)	# copy counter
# allocate and fill the bit-sliced key schedule below %rbp
1796	mov	%eax, %ebx		# rounds
1797	shl	\$7, %rax		# 128 bytes per inner round key
1798	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
1801	mov	%rsp, %rax		# pass key schedule
1802	mov	$key, %rcx		# pass key
1803	mov	%ebx, %r10d		# pass rounds
1804	call	_bsaes_key_convert
1805	pxor	%xmm6,%xmm7		# fix up last round key
1806	movdqa	%xmm7,(%rax)		# save last round key
# pre-byte-swap round-0 key and counter so per-iteration swaps are cheap
1808	movdqa	(%rsp), @XMM[9]		# load round0 key
1809	lea	.LADD1(%rip), %r11
1810	movdqa	0x20(%rbp), @XMM[0]	# counter copy
1811	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
1812	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
1813	pshufb	@XMM[8], @XMM[0]
1814	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
# materialize counter+1 .. counter+7 via the .LADDx constant tables
1818	movdqa	@XMM[0], 0x20(%rbp)	# save counter
1819	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
1820	movdqa	@XMM[0], @XMM[2]
1821	paddd	0x00(%r11), @XMM[1]	# .LADD1
1822	movdqa	@XMM[0], @XMM[3]
1823	paddd	0x10(%r11), @XMM[2]	# .LADD2
1824	movdqa	@XMM[0], @XMM[4]
1825	paddd	0x20(%r11), @XMM[3]	# .LADD3
1826	movdqa	@XMM[0], @XMM[5]
1827	paddd	0x30(%r11), @XMM[4]	# .LADD4
1828	movdqa	@XMM[0], @XMM[6]
1829	paddd	0x40(%r11), @XMM[5]	# .LADD5
1830	movdqa	@XMM[0], @XMM[7]
1831	paddd	0x50(%r11), @XMM[6]	# .LADD6
1832	paddd	0x60(%r11), @XMM[7]	# .LADD7
1834	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
1835	# to flip byte order in 32-bit counter
1836	movdqa	(%rsp), @XMM[9]		# round 0 key
1837	lea	0x10(%rsp), %rax	# pass key schedule
1838	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
1839	pxor	@XMM[9], @XMM[0]	# xor with round0 key
1840	pxor	@XMM[9], @XMM[1]
1841	pshufb	@XMM[8], @XMM[0]
1842	pxor	@XMM[9], @XMM[2]
1843	pshufb	@XMM[8], @XMM[1]
1844	pxor	@XMM[9], @XMM[3]
1845	pshufb	@XMM[8], @XMM[2]
1846	pxor	@XMM[9], @XMM[4]
1847	pshufb	@XMM[8], @XMM[3]
1848	pxor	@XMM[9], @XMM[5]
1849	pshufb	@XMM[8], @XMM[4]
1850	pxor	@XMM[9], @XMM[6]
1851	pshufb	@XMM[8], @XMM[5]
1852	pxor	@XMM[9], @XMM[7]
1853	pshufb	@XMM[8], @XMM[6]
1854	lea	.LBS0(%rip), %r11	# constants table
1855	pshufb	@XMM[8], @XMM[7]
1856	mov	%ebx,%r10d		# pass rounds
1858	call	_bsaes_encrypt8_bitslice
1861	jc	.Lctr_enc_loop_done
# full 8-block iteration: xor keystream into input, write, bump counter by 8
1863	movdqu	0x00($inp), @XMM[8]	# load input
1864	movdqu	0x10($inp), @XMM[9]
1865	movdqu	0x20($inp), @XMM[10]
1866	movdqu	0x30($inp), @XMM[11]
1867	movdqu	0x40($inp), @XMM[12]
1868	movdqu	0x50($inp), @XMM[13]
1869	movdqu	0x60($inp), @XMM[14]
1870	movdqu	0x70($inp), @XMM[15]
1872	pxor	@XMM[0], @XMM[8]
1873	movdqa	0x20(%rbp), @XMM[0]	# load counter
1874	pxor	@XMM[9], @XMM[1]
1875	movdqu	@XMM[8], 0x00($out)	# write output
1876	pxor	@XMM[10], @XMM[4]
1877	movdqu	@XMM[1], 0x10($out)
1878	pxor	@XMM[11], @XMM[6]
1879	movdqu	@XMM[4], 0x20($out)
1880	pxor	@XMM[12], @XMM[3]
1881	movdqu	@XMM[6], 0x30($out)
1882	pxor	@XMM[13], @XMM[7]
1883	movdqu	@XMM[3], 0x40($out)
1884	pxor	@XMM[14], @XMM[2]
1885	movdqu	@XMM[7], 0x50($out)
1886	pxor	@XMM[15], @XMM[5]
1887	movdqu	@XMM[2], 0x60($out)
1888	lea	.LADD1(%rip), %r11
1889	movdqu	@XMM[5], 0x70($out)
1890	lea	0x80($out), $out
1891	paddd	0x70(%r11), @XMM[0]	# .LADD8
1896	.Lctr_enc_loop_done:
# tail: emit as many of the 8 keystream blocks as remain
1898	movdqu	0x00($inp), @XMM[8]	# load input
1899	pxor	@XMM[8], @XMM[0]
1900	movdqu	@XMM[0], 0x00($out)	# write output
1903	movdqu	0x10($inp), @XMM[9]
1904	pxor	@XMM[9], @XMM[1]
1905	movdqu	@XMM[1], 0x10($out)
1907	movdqu	0x20($inp), @XMM[10]
1908	pxor	@XMM[10], @XMM[4]
1909	movdqu	@XMM[4], 0x20($out)
1912	movdqu	0x30($inp), @XMM[11]
1913	pxor	@XMM[11], @XMM[6]
1914	movdqu	@XMM[6], 0x30($out)
1916	movdqu	0x40($inp), @XMM[12]
1917	pxor	@XMM[12], @XMM[3]
1918	movdqu	@XMM[3], 0x40($out)
1921	movdqu	0x50($inp), @XMM[13]
1922	pxor	@XMM[13], @XMM[7]
1923	movdqu	@XMM[7], 0x50($out)
1925	movdqu	0x60($inp), @XMM[14]
1926	pxor	@XMM[14], @XMM[2]
1927	movdqu	@XMM[2], 0x60($out)
# short path: single block at a time through asm_AES_encrypt
1932	lea	0x20(%rbp), $arg1
1933	lea	0x30(%rbp), $arg2
1935	call	asm_AES_encrypt
1936	movdqu	($inp), @XMM[1]
1938	mov	0x2c(%rbp), %eax	# load 32-bit counter
1940	pxor	0x30(%rbp), @XMM[1]
1941	inc	%eax			# increment
1942	movdqu	@XMM[1], ($out)
# NOTE(review): the counter dword is loaded from 0x2c(%rbp) above but
# stored back to 0x2c(%rsp) below; %rsp and %rbp differ here (the key
# schedule sits below %rbp), so this looks inconsistent -- confirm
# against upstream bsaes-x86_64.pl before changing.
1945	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
1952	.Lctr_enc_bzero:	# wipe key schedule [if any]
1953	movdqa	%xmm0, 0x00(%rax)
1954	movdqa	%xmm0, 0x10(%rax)
1955	lea	0x20(%rax), %rax
1959	lea	(%rbp),%rsp		# restore %rsp
# Win64 only: restore non-volatile XMM registers
1961	$code.=<<___ if ($win64);
1962	movaps	0x40(%rbp), %xmm6
1963	movaps	0x50(%rbp), %xmm7
1964	movaps	0x60(%rbp), %xmm8
1965	movaps	0x70(%rbp), %xmm9
1966	movaps	0x80(%rbp), %xmm10
1967	movaps	0x90(%rbp), %xmm11
1968	movaps	0xa0(%rbp), %xmm12
1969	movaps	0xb0(%rbp), %xmm13
1970	movaps	0xc0(%rbp), %xmm14
1971	movaps	0xd0(%rbp), %xmm15
1972	lea	0xa0(%rbp), %rsp
# common epilogue: restore callee-saved GPRs
1975	mov	0x48(%rsp), %r15
1976	mov	0x50(%rsp), %r14
1977	mov	0x58(%rsp), %r13
1978	mov	0x60(%rsp), %r12
1979	mov	0x68(%rsp), %rbx
1980	mov	0x70(%rsp), %rax
1981	lea	0x78(%rsp), %rsp
1985	.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
1987 ######################################################################
1988 # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1989 # const AES_KEY *key1, const AES_KEY *key2,
1990 # const unsigned char iv[16]);
1992 my ($twmask,$twres,$twtmp)=@XMM[13..15];
1994 .globl bsaes_xts_encrypt
1995 .type bsaes_xts_encrypt,\@abi-omnipotent
2006 lea -0x48(%rsp), %rsp
2008 $code.=<<___ if ($win64);
2009 mov 0xa0(%rsp),$arg5 # pull key2
2010 mov 0xa8(%rsp),$arg6 # pull ivp
2011 lea -0xa0(%rsp), %rsp
2012 movaps %xmm6, 0x40(%rsp)
2013 movaps %xmm7, 0x50(%rsp)
2014 movaps %xmm8, 0x60(%rsp)
2015 movaps %xmm9, 0x70(%rsp)
2016 movaps %xmm10, 0x80(%rsp)
2017 movaps %xmm11, 0x90(%rsp)
2018 movaps %xmm12, 0xa0(%rsp)
2019 movaps %xmm13, 0xb0(%rsp)
2020 movaps %xmm14, 0xc0(%rsp)
2021 movaps %xmm15, 0xd0(%rsp)
2025 mov %rsp, %rbp # backup %rsp
2026 mov $arg1, $inp # backup arguments
2032 lea 0x20(%rbp), $arg2
2034 call asm_AES_encrypt # generate initial tweak
2036 mov 240($key), %eax # rounds
2037 mov $len, %rbx # backup $len
2039 mov %eax, %edx # rounds
2040 shl \$7, %rax # 128 bytes per inner round key
2041 sub \$`128-32`, %rax # size of bit-sliced key schedule
2044 mov %rsp, %rax # pass key schedule
2045 mov $key, %rcx # pass key
2046 mov %edx, %r10d # pass rounds
2047 call _bsaes_key_convert
2048 pxor %xmm6, %xmm7 # fix up last round key
2049 movdqa %xmm7, (%rax) # save last round key
2052 sub \$0x80, %rsp # place for tweak[8]
2053 movdqa 0x20(%rbp), @XMM[7] # initial tweak
2056 movdqa .Lxts_magic(%rip), $twmask
2057 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2066 for ($i=0;$i<7;$i++) {
2068 pshufd \$0x13, $twtmp, $twres
2070 movdqa @XMM[7], @XMM[$i]
2071 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2072 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2073 pand $twmask, $twres # isolate carry and residue
2074 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2075 pxor $twres, @XMM[7]
2077 $code.=<<___ if ($i>=1);
2078 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2080 $code.=<<___ if ($i>=2);
2081 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2085 movdqu 0x60($inp), @XMM[8+6]
2086 pxor @XMM[8+5], @XMM[5]
2087 movdqu 0x70($inp), @XMM[8+7]
2088 lea 0x80($inp), $inp
2089 movdqa @XMM[7], 0x70(%rsp)
2090 pxor @XMM[8+6], @XMM[6]
2091 lea 0x80(%rsp), %rax # pass key schedule
2092 pxor @XMM[8+7], @XMM[7]
2093 mov %edx, %r10d # pass rounds
2095 call _bsaes_encrypt8
2097 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2098 pxor 0x10(%rsp), @XMM[1]
2099 movdqu @XMM[0], 0x00($out) # write output
2100 pxor 0x20(%rsp), @XMM[4]
2101 movdqu @XMM[1], 0x10($out)
2102 pxor 0x30(%rsp), @XMM[6]
2103 movdqu @XMM[4], 0x20($out)
2104 pxor 0x40(%rsp), @XMM[3]
2105 movdqu @XMM[6], 0x30($out)
2106 pxor 0x50(%rsp), @XMM[7]
2107 movdqu @XMM[3], 0x40($out)
2108 pxor 0x60(%rsp), @XMM[2]
2109 movdqu @XMM[7], 0x50($out)
2110 pxor 0x70(%rsp), @XMM[5]
2111 movdqu @XMM[2], 0x60($out)
2112 movdqu @XMM[5], 0x70($out)
2113 lea 0x80($out), $out
2115 movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
2117 movdqa .Lxts_magic(%rip), $twmask
2118 pcmpgtd @XMM[7], $twtmp
2119 pshufd \$0x13, $twtmp, $twres
2121 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2122 pand $twmask, $twres # isolate carry and residue
2123 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2124 pxor $twres, @XMM[7]
2133 for ($i=0;$i<7;$i++) {
2135 pshufd \$0x13, $twtmp, $twres
2137 movdqa @XMM[7], @XMM[$i]
2138 movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2139 paddq @XMM[7], @XMM[7] # psllq 1,$tweak
2140 pand $twmask, $twres # isolate carry and residue
2141 pcmpgtd @XMM[7], $twtmp # broadcast upper bits
2142 pxor $twres, @XMM[7]
2144 $code.=<<___ if ($i>=1);
2145 movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
2146 cmp \$`0x10*$i`,$len
2149 $code.=<<___ if ($i>=2);
2150 pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
2154 movdqu 0x60($inp), @XMM[8+6]
2155 pxor @XMM[8+5], @XMM[5]
2156 movdqa @XMM[7], 0x70(%rsp)
2157 lea 0x70($inp), $inp
2158 pxor @XMM[8+6], @XMM[6]
2159 lea 0x80(%rsp), %rax # pass key schedule
2160 mov %edx, %r10d # pass rounds
2162 call _bsaes_encrypt8
2164 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2165 pxor 0x10(%rsp), @XMM[1]
2166 movdqu @XMM[0], 0x00($out) # write output
2167 pxor 0x20(%rsp), @XMM[4]
2168 movdqu @XMM[1], 0x10($out)
2169 pxor 0x30(%rsp), @XMM[6]
2170 movdqu @XMM[4], 0x20($out)
2171 pxor 0x40(%rsp), @XMM[3]
2172 movdqu @XMM[6], 0x30($out)
2173 pxor 0x50(%rsp), @XMM[7]
2174 movdqu @XMM[3], 0x40($out)
2175 pxor 0x60(%rsp), @XMM[2]
2176 movdqu @XMM[7], 0x50($out)
2177 movdqu @XMM[2], 0x60($out)
2178 lea 0x70($out), $out
2180 movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
2184 pxor @XMM[8+4], @XMM[4]
2185 lea 0x60($inp), $inp
2186 pxor @XMM[8+5], @XMM[5]
2187 lea 0x80(%rsp), %rax # pass key schedule
2188 mov %edx, %r10d # pass rounds
2190 call _bsaes_encrypt8
2192 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2193 pxor 0x10(%rsp), @XMM[1]
2194 movdqu @XMM[0], 0x00($out) # write output
2195 pxor 0x20(%rsp), @XMM[4]
2196 movdqu @XMM[1], 0x10($out)
2197 pxor 0x30(%rsp), @XMM[6]
2198 movdqu @XMM[4], 0x20($out)
2199 pxor 0x40(%rsp), @XMM[3]
2200 movdqu @XMM[6], 0x30($out)
2201 pxor 0x50(%rsp), @XMM[7]
2202 movdqu @XMM[3], 0x40($out)
2203 movdqu @XMM[7], 0x50($out)
2204 lea 0x60($out), $out
2206 movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
2210 pxor @XMM[8+3], @XMM[3]
2211 lea 0x50($inp), $inp
2212 pxor @XMM[8+4], @XMM[4]
2213 lea 0x80(%rsp), %rax # pass key schedule
2214 mov %edx, %r10d # pass rounds
2216 call _bsaes_encrypt8
2218 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2219 pxor 0x10(%rsp), @XMM[1]
2220 movdqu @XMM[0], 0x00($out) # write output
2221 pxor 0x20(%rsp), @XMM[4]
2222 movdqu @XMM[1], 0x10($out)
2223 pxor 0x30(%rsp), @XMM[6]
2224 movdqu @XMM[4], 0x20($out)
2225 pxor 0x40(%rsp), @XMM[3]
2226 movdqu @XMM[6], 0x30($out)
2227 movdqu @XMM[3], 0x40($out)
2228 lea 0x50($out), $out
2230 movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
2234 pxor @XMM[8+2], @XMM[2]
2235 lea 0x40($inp), $inp
2236 pxor @XMM[8+3], @XMM[3]
2237 lea 0x80(%rsp), %rax # pass key schedule
2238 mov %edx, %r10d # pass rounds
2240 call _bsaes_encrypt8
2242 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2243 pxor 0x10(%rsp), @XMM[1]
2244 movdqu @XMM[0], 0x00($out) # write output
2245 pxor 0x20(%rsp), @XMM[4]
2246 movdqu @XMM[1], 0x10($out)
2247 pxor 0x30(%rsp), @XMM[6]
2248 movdqu @XMM[4], 0x20($out)
2249 movdqu @XMM[6], 0x30($out)
2250 lea 0x40($out), $out
2252 movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
2256 pxor @XMM[8+1], @XMM[1]
2257 lea 0x30($inp), $inp
2258 pxor @XMM[8+2], @XMM[2]
2259 lea 0x80(%rsp), %rax # pass key schedule
2260 mov %edx, %r10d # pass rounds
2262 call _bsaes_encrypt8
2264 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2265 pxor 0x10(%rsp), @XMM[1]
2266 movdqu @XMM[0], 0x00($out) # write output
2267 pxor 0x20(%rsp), @XMM[4]
2268 movdqu @XMM[1], 0x10($out)
2269 movdqu @XMM[4], 0x20($out)
2270 lea 0x30($out), $out
2272 movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
2276 pxor @XMM[8+0], @XMM[0]
2277 lea 0x20($inp), $inp
2278 pxor @XMM[8+1], @XMM[1]
2279 lea 0x80(%rsp), %rax # pass key schedule
2280 mov %edx, %r10d # pass rounds
2282 call _bsaes_encrypt8
2284 pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2285 pxor 0x10(%rsp), @XMM[1]
2286 movdqu @XMM[0], 0x00($out) # write output
2287 movdqu @XMM[1], 0x10($out)
2288 lea 0x20($out), $out
2290 movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
2294 pxor @XMM[0], @XMM[8]
2295 lea 0x10($inp), $inp
2296 movdqa @XMM[8], 0x20(%rbp)
2297 lea 0x20(%rbp), $arg1
2298 lea 0x20(%rbp), $arg2
2300 call asm_AES_encrypt # doesn't touch %xmm
2301 pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
2302 #pxor @XMM[8], @XMM[0]
2303 #lea 0x80(%rsp), %rax # pass key schedule
2304 #mov %edx, %r10d # pass rounds
2305 #call _bsaes_encrypt8
2306 #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
2307 movdqu @XMM[0], 0x00($out) # write output
2308 lea 0x10($out), $out
2310 movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
2319 movzb -16(%rdx), %ecx
2327 movdqu -16($out), @XMM[0]
2328 lea 0x20(%rbp), $arg1
2329 pxor @XMM[7], @XMM[0]
2330 lea 0x20(%rbp), $arg2
2331 movdqa @XMM[0], 0x20(%rbp)
2333 call asm_AES_encrypt # doesn't touch %xmm
2334 pxor 0x20(%rbp), @XMM[7]
2335 movdqu @XMM[7], -16($out)
2340 .Lxts_enc_bzero: # wipe key schedule [if any]
2341 movdqa %xmm0, 0x00(%rax)
2342 movdqa %xmm0, 0x10(%rax)
2343 lea 0x20(%rax), %rax
2347 lea (%rbp),%rsp # restore %rsp
2349 $code.=<<___ if ($win64);
2350 movaps 0x40(%rbp), %xmm6
2351 movaps 0x50(%rbp), %xmm7
2352 movaps 0x60(%rbp), %xmm8
2353 movaps 0x70(%rbp), %xmm9
2354 movaps 0x80(%rbp), %xmm10
2355 movaps 0x90(%rbp), %xmm11
2356 movaps 0xa0(%rbp), %xmm12
2357 movaps 0xb0(%rbp), %xmm13
2358 movaps 0xc0(%rbp), %xmm14
2359 movaps 0xd0(%rbp), %xmm15
2360 lea 0xa0(%rbp), %rsp
2363 mov 0x48(%rsp), %r15
2364 mov 0x50(%rsp), %r14
2365 mov 0x58(%rsp), %r13
2366 mov 0x60(%rsp), %r12
2367 mov 0x68(%rsp), %rbx
2368 mov 0x70(%rsp), %rax
2369 lea 0x78(%rsp), %rsp
2373 .size bsaes_xts_encrypt,.-bsaes_xts_encrypt
###
### bsaes_xts_decrypt: mirror of bsaes_xts_encrypt using the decrypt
### core. Note the key-schedule fix-up differs from the encrypt side:
### here round-0 is xored in and first/last round keys swap roles.
### Ciphertext stealing needs the second-to-last tweak, kept in XMM6.
### NOTE(review): partial view of the file -- code left byte-identical.
###
2375	.globl	bsaes_xts_decrypt
2376	.type	bsaes_xts_decrypt,\@abi-omnipotent
2387	lea	-0x48(%rsp), %rsp
# Win64 only: pull stack-passed args, save XMM6-15
2389	$code.=<<___ if ($win64);
2390	mov	0xa0(%rsp),$arg5	# pull key2
2391	mov	0xa8(%rsp),$arg6	# pull ivp
2392	lea	-0xa0(%rsp), %rsp
2393	movaps	%xmm6, 0x40(%rsp)
2394	movaps	%xmm7, 0x50(%rsp)
2395	movaps	%xmm8, 0x60(%rsp)
2396	movaps	%xmm9, 0x70(%rsp)
2397	movaps	%xmm10, 0x80(%rsp)
2398	movaps	%xmm11, 0x90(%rsp)
2399	movaps	%xmm12, 0xa0(%rsp)
2400	movaps	%xmm13, 0xb0(%rsp)
2401	movaps	%xmm14, 0xc0(%rsp)
2402	movaps	%xmm15, 0xd0(%rsp)
2406	mov	%rsp, %rbp		# backup %rsp
2407	mov	$arg1, $inp		# backup arguments
# tweak is always generated with ENcryption of the IV under key2
2413	lea	0x20(%rbp), $arg2
2415	call	asm_AES_encrypt		# generate initial tweak
2417	mov	240($key), %eax		# rounds
2418	mov	$len, %rbx		# backup $len
2420	mov	%eax, %edx		# rounds
2421	shl	\$7, %rax		# 128 bytes per inner round key
2422	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
2425	mov	%rsp, %rax		# pass key schedule
2426	mov	$key, %rcx		# pass key
2427	mov	%edx, %r10d		# pass rounds
2428	call	_bsaes_key_convert
2429	pxor	(%rsp), %xmm7		# fix up round 0 key
2430	movdqa	%xmm6, (%rax)		# save last round key
2431	movdqa	%xmm7, (%rsp)
2433	xor	%eax, %eax		# if ($len%16) len-=16;
2440	sub	\$0x80, %rsp		# place for tweak[8]
2441	movdqa	0x20(%rbp), @XMM[7]	# initial tweak
2444	movdqa	.Lxts_magic(%rip), $twmask
2445	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
# unrolled (Perl-side) generation of tweak[0..6] plus input loads
2454	for ($i=0;$i<7;$i++) {
2456	pshufd	\$0x13, $twtmp, $twres
2458	movdqa	@XMM[7], @XMM[$i]
2459	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2460	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2461	pand	$twmask, $twres		# isolate carry and residue
2462	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2463	pxor	$twres, @XMM[7]
2465	$code.=<<___ if ($i>=1);
2466	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2468	$code.=<<___ if ($i>=2);
2469	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
# main 8-block loop body (output register order differs from encrypt)
2473	movdqu	0x60($inp), @XMM[8+6]
2474	pxor	@XMM[8+5], @XMM[5]
2475	movdqu	0x70($inp), @XMM[8+7]
2476	lea	0x80($inp), $inp
2477	movdqa	@XMM[7], 0x70(%rsp)
2478	pxor	@XMM[8+6], @XMM[6]
2479	lea	0x80(%rsp), %rax	# pass key schedule
2480	pxor	@XMM[8+7], @XMM[7]
2481	mov	%edx, %r10d		# pass rounds
2483	call	_bsaes_decrypt8
2485	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2486	pxor	0x10(%rsp), @XMM[1]
2487	movdqu	@XMM[0], 0x00($out)	# write output
2488	pxor	0x20(%rsp), @XMM[6]
2489	movdqu	@XMM[1], 0x10($out)
2490	pxor	0x30(%rsp), @XMM[4]
2491	movdqu	@XMM[6], 0x20($out)
2492	pxor	0x40(%rsp), @XMM[2]
2493	movdqu	@XMM[4], 0x30($out)
2494	pxor	0x50(%rsp), @XMM[7]
2495	movdqu	@XMM[2], 0x40($out)
2496	pxor	0x60(%rsp), @XMM[3]
2497	movdqu	@XMM[7], 0x50($out)
2498	pxor	0x70(%rsp), @XMM[5]
2499	movdqu	@XMM[3], 0x60($out)
2500	movdqu	@XMM[5], 0x70($out)
2501	lea	0x80($out), $out
2503	movdqa	0x70(%rsp), @XMM[7]	# prepare next iteration tweak
2505	movdqa	.Lxts_magic(%rip), $twmask
2506	pcmpgtd	@XMM[7], $twtmp
2507	pshufd	\$0x13, $twtmp, $twres
2509	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2510	pand	$twmask, $twres		# isolate carry and residue
2511	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2512	pxor	$twres, @XMM[7]
# short path: same tweak generation, with per-count length checks
2521	for ($i=0;$i<7;$i++) {
2523	pshufd	\$0x13, $twtmp, $twres
2525	movdqa	@XMM[7], @XMM[$i]
2526	movdqa	@XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
2527	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2528	pand	$twmask, $twres		# isolate carry and residue
2529	pcmpgtd	@XMM[7], $twtmp		# broadcast upper bits
2530	pxor	$twres, @XMM[7]
2532	$code.=<<___ if ($i>=1);
2533	movdqu	`0x10*($i-1)`($inp), @XMM[8+$i-1]
2534	cmp	\$`0x10*$i`,$len
2537	$code.=<<___ if ($i>=2);
2538	pxor	@XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
# 7-block tail
2542	movdqu	0x60($inp), @XMM[8+6]
2543	pxor	@XMM[8+5], @XMM[5]
2544	movdqa	@XMM[7], 0x70(%rsp)
2545	lea	0x70($inp), $inp
2546	pxor	@XMM[8+6], @XMM[6]
2547	lea	0x80(%rsp), %rax	# pass key schedule
2548	mov	%edx, %r10d		# pass rounds
2550	call	_bsaes_decrypt8
2552	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2553	pxor	0x10(%rsp), @XMM[1]
2554	movdqu	@XMM[0], 0x00($out)	# write output
2555	pxor	0x20(%rsp), @XMM[6]
2556	movdqu	@XMM[1], 0x10($out)
2557	pxor	0x30(%rsp), @XMM[4]
2558	movdqu	@XMM[6], 0x20($out)
2559	pxor	0x40(%rsp), @XMM[2]
2560	movdqu	@XMM[4], 0x30($out)
2561	pxor	0x50(%rsp), @XMM[7]
2562	movdqu	@XMM[2], 0x40($out)
2563	pxor	0x60(%rsp), @XMM[3]
2564	movdqu	@XMM[7], 0x50($out)
2565	movdqu	@XMM[3], 0x60($out)
2566	lea	0x70($out), $out
2568	movdqa	0x70(%rsp), @XMM[7]	# next iteration tweak
# 6-block tail
2572	pxor	@XMM[8+4], @XMM[4]
2573	lea	0x60($inp), $inp
2574	pxor	@XMM[8+5], @XMM[5]
2575	lea	0x80(%rsp), %rax	# pass key schedule
2576	mov	%edx, %r10d		# pass rounds
2578	call	_bsaes_decrypt8
2580	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2581	pxor	0x10(%rsp), @XMM[1]
2582	movdqu	@XMM[0], 0x00($out)	# write output
2583	pxor	0x20(%rsp), @XMM[6]
2584	movdqu	@XMM[1], 0x10($out)
2585	pxor	0x30(%rsp), @XMM[4]
2586	movdqu	@XMM[6], 0x20($out)
2587	pxor	0x40(%rsp), @XMM[2]
2588	movdqu	@XMM[4], 0x30($out)
2589	pxor	0x50(%rsp), @XMM[7]
2590	movdqu	@XMM[2], 0x40($out)
2591	movdqu	@XMM[7], 0x50($out)
2592	lea	0x60($out), $out
2594	movdqa	0x60(%rsp), @XMM[7]	# next iteration tweak
# 5-block tail
2598	pxor	@XMM[8+3], @XMM[3]
2599	lea	0x50($inp), $inp
2600	pxor	@XMM[8+4], @XMM[4]
2601	lea	0x80(%rsp), %rax	# pass key schedule
2602	mov	%edx, %r10d		# pass rounds
2604	call	_bsaes_decrypt8
2606	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2607	pxor	0x10(%rsp), @XMM[1]
2608	movdqu	@XMM[0], 0x00($out)	# write output
2609	pxor	0x20(%rsp), @XMM[6]
2610	movdqu	@XMM[1], 0x10($out)
2611	pxor	0x30(%rsp), @XMM[4]
2612	movdqu	@XMM[6], 0x20($out)
2613	pxor	0x40(%rsp), @XMM[2]
2614	movdqu	@XMM[4], 0x30($out)
2615	movdqu	@XMM[2], 0x40($out)
2616	lea	0x50($out), $out
2618	movdqa	0x50(%rsp), @XMM[7]	# next iteration tweak
# 4-block tail
2622	pxor	@XMM[8+2], @XMM[2]
2623	lea	0x40($inp), $inp
2624	pxor	@XMM[8+3], @XMM[3]
2625	lea	0x80(%rsp), %rax	# pass key schedule
2626	mov	%edx, %r10d		# pass rounds
2628	call	_bsaes_decrypt8
2630	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2631	pxor	0x10(%rsp), @XMM[1]
2632	movdqu	@XMM[0], 0x00($out)	# write output
2633	pxor	0x20(%rsp), @XMM[6]
2634	movdqu	@XMM[1], 0x10($out)
2635	pxor	0x30(%rsp), @XMM[4]
2636	movdqu	@XMM[6], 0x20($out)
2637	movdqu	@XMM[4], 0x30($out)
2638	lea	0x40($out), $out
2640	movdqa	0x40(%rsp), @XMM[7]	# next iteration tweak
# 3-block tail
2644	pxor	@XMM[8+1], @XMM[1]
2645	lea	0x30($inp), $inp
2646	pxor	@XMM[8+2], @XMM[2]
2647	lea	0x80(%rsp), %rax	# pass key schedule
2648	mov	%edx, %r10d		# pass rounds
2650	call	_bsaes_decrypt8
2652	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2653	pxor	0x10(%rsp), @XMM[1]
2654	movdqu	@XMM[0], 0x00($out)	# write output
2655	pxor	0x20(%rsp), @XMM[6]
2656	movdqu	@XMM[1], 0x10($out)
2657	movdqu	@XMM[6], 0x20($out)
2658	lea	0x30($out), $out
2660	movdqa	0x30(%rsp), @XMM[7]	# next iteration tweak
# 2-block tail
2664	pxor	@XMM[8+0], @XMM[0]
2665	lea	0x20($inp), $inp
2666	pxor	@XMM[8+1], @XMM[1]
2667	lea	0x80(%rsp), %rax	# pass key schedule
2668	mov	%edx, %r10d		# pass rounds
2670	call	_bsaes_decrypt8
2672	pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2673	pxor	0x10(%rsp), @XMM[1]
2674	movdqu	@XMM[0], 0x00($out)	# write output
2675	movdqu	@XMM[1], 0x10($out)
2676	lea	0x20($out), $out
2678	movdqa	0x20(%rsp), @XMM[7]	# next iteration tweak
# single block: non-bit-sliced fallback
2682	pxor	@XMM[0], @XMM[8]
2683	lea	0x10($inp), $inp
2684	movdqa	@XMM[8], 0x20(%rbp)
2685	lea	0x20(%rbp), $arg1
2686	lea	0x20(%rbp), $arg2
2688	call	asm_AES_decrypt		# doesn't touch %xmm
2689	pxor	0x20(%rbp), @XMM[0]	# ^= tweak[]
2690	#pxor	@XMM[8], @XMM[0]
2691	#lea	0x80(%rsp), %rax	# pass key schedule
2692	#mov	%edx, %r10d		# pass rounds
2693	#call	_bsaes_decrypt8
2694	#pxor	0x00(%rsp), @XMM[0]	# ^= tweak[]
2695	movdqu	@XMM[0], 0x00($out)	# write output
2696	lea	0x10($out), $out
2698	movdqa	0x10(%rsp), @XMM[7]	# next iteration tweak
# ciphertext stealing: decrypt last full block with tweak n+1 (XMM7),
# keeping tweak n in XMM6 for the partial block
2705	movdqa	.Lxts_magic(%rip), $twmask
2706	pcmpgtd	@XMM[7], $twtmp
2707	pshufd	\$0x13, $twtmp, $twres
2708	movdqa	@XMM[7], @XMM[6]
2709	paddq	@XMM[7], @XMM[7]	# psllq	1,$tweak
2710	pand	$twmask, $twres		# isolate carry and residue
2711	movdqu	($inp), @XMM[0]
2712	pxor	$twres, @XMM[7]
2714	lea	0x20(%rbp), $arg1
2715	pxor	@XMM[7], @XMM[0]
2716	lea	0x20(%rbp), $arg2
2717	movdqa	@XMM[0], 0x20(%rbp)
2719	call	asm_AES_decrypt		# doesn't touch %xmm
2720	pxor	0x20(%rbp), @XMM[7]
2722	movdqu	@XMM[7], ($out)
2725	movzb	16($inp), %eax
2734	movdqu	($out), @XMM[0]
2735	lea	0x20(%rbp), $arg1
2736	pxor	@XMM[6], @XMM[0]
2737	lea	0x20(%rbp), $arg2
2738	movdqa	@XMM[0], 0x20(%rbp)
2740	call	asm_AES_decrypt		# doesn't touch %xmm
2741	pxor	0x20(%rbp), @XMM[6]
2742	movdqu	@XMM[6], ($out)
2747	.Lxts_dec_bzero:	# wipe key schedule [if any]
2748	movdqa	%xmm0, 0x00(%rax)
2749	movdqa	%xmm0, 0x10(%rax)
2750	lea	0x20(%rax), %rax
2754	lea	(%rbp),%rsp		# restore %rsp
# Win64 only: restore non-volatile XMM registers
2756	$code.=<<___ if ($win64);
2757	movaps	0x40(%rbp), %xmm6
2758	movaps	0x50(%rbp), %xmm7
2759	movaps	0x60(%rbp), %xmm8
2760	movaps	0x70(%rbp), %xmm9
2761	movaps	0x80(%rbp), %xmm10
2762	movaps	0x90(%rbp), %xmm11
2763	movaps	0xa0(%rbp), %xmm12
2764	movaps	0xb0(%rbp), %xmm13
2765	movaps	0xc0(%rbp), %xmm14
2766	movaps	0xd0(%rbp), %xmm15
2767	lea	0xa0(%rbp), %rsp
# common epilogue: restore callee-saved GPRs
2770	mov	0x48(%rsp), %r15
2771	mov	0x50(%rsp), %r14
2772	mov	0x58(%rsp), %r13
2773	mov	0x60(%rsp), %r12
2774	mov	0x68(%rsp), %rbx
2775	mov	0x70(%rsp), %rax
2776	lea	0x78(%rsp), %rsp
2780	.size	bsaes_xts_decrypt,.-bsaes_xts_decrypt
###
### _bsaes_const: read-only constant tables used by the bit-sliced
### core -- pshufb permutation masks for (Inv)ShiftRows and byte
### swapping, bit-slice interleave masks, the AES round-constant
### 0x63 vector, and the .LADDx counter-increment vectors for CTR.
###
2784	.type	_bsaes_const,\@object
2787	.LM0ISR:	# InvShiftRows constants
2788	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
2790	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
2792	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
2793	.LBS0:		# bit-slice constants
2794	.quad	0x5555555555555555, 0x5555555555555555
2796	.quad	0x3333333333333333, 0x3333333333333333
2798	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
2799	.LSR:		# shiftrows constants
2800	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
2802	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
2804	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
2806	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
2807	.LNOT:		# magic constants
2808	.quad	0xffffffffffffffff, 0xffffffffffffffff
2810	.quad	0x6363636363636363, 0x6363636363636363
2811	.LSWPUP:	# byte-swap upper dword
2812	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
2814	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
2815	.LADD1:		# counter increment constants
2816	.quad	0x0000000000000000, 0x0000000100000000
2818	.quad	0x0000000000000000, 0x0000000200000000
2820	.quad	0x0000000000000000, 0x0000000300000000
2822	.quad	0x0000000000000000, 0x0000000400000000
2824	.quad	0x0000000000000000, 0x0000000500000000
2826	.quad	0x0000000000000000, 0x0000000600000000
2828	.quad	0x0000000000000000, 0x0000000700000000
2830	.quad	0x0000000000000000, 0x0000000800000000
2833	.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
2835	.size	_bsaes_const,.-_bsaes_const
###
### se_handler: Win64 structured-exception handler. If the fault RIP
### is inside a function body (between the prologue and epilogue
### labels recorded in HandlerData), it restores XMM6-15 and the
### callee-saved GPRs from the frame, fixes up context->Rsp, then
### chains to RtlVirtualUnwind.
###
2838	# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
2839	#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
2847	.extern	__imp_RtlVirtualUnwind
2848	.type	se_handler,\@abi-omnipotent
2862	mov	120($context),%rax	# pull context->Rax
2863	mov	248($context),%rbx	# pull context->Rip
2865	mov	8($disp),%rsi		# disp->ImageBase
2866	mov	56($disp),%r11		# disp->HandlerData
# bail out (keep context unchanged) when RIP is outside body
2868	mov	0(%r11),%r10d		# HandlerData[0]
2869	lea	(%rsi,%r10),%r10	# prologue label
2870	cmp	%r10,%rbx		# context->Rip<prologue label
2873	mov	152($context),%rax	# pull context->Rsp
2875	mov	4(%r11),%r10d		# HandlerData[1]
2876	lea	(%rsi,%r10),%r10	# epilogue label
2877	cmp	%r10,%rbx		# context->Rip>=epilogue label
2880	mov	160($context),%rax	# pull context->Rbp
# copy the 10 saved XMM registers back into the CONTEXT record
2882	lea	0x40(%rax),%rsi		# %xmm save area
2883	lea	512($context),%rdi	# &context.Xmm6
2884	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
2885	.long	0xa548f3fc		# cld; rep movsq
2886	lea	0xa0(%rax),%rax		# adjust stack pointer
2894	lea	0x78(%rax),%rax		# adjust stack pointer
2895	mov	%rbx,144($context)	# restore context->Rbx
2896	mov	%rbp,160($context)	# restore context->Rbp
2897	mov	%r12,216($context)	# restore context->R12
2898	mov	%r13,224($context)	# restore context->R13
2899	mov	%r14,232($context)	# restore context->R14
2900	mov	%r15,240($context)	# restore context->R15
2903	mov	%rax,152($context)	# restore context->Rsp
# duplicate the CONTEXT record, then continue the unwind
2905	mov	40($disp),%rdi		# disp->ContextRecord
2906	mov	$context,%rsi		# context
2907	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
2908	.long	0xa548f3fc		# cld; rep movsq
2911	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
2912	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
2913	mov	0(%rsi),%r8		# arg3, disp->ControlPc
2914	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
2915	mov	40(%rsi),%r10		# disp->ContextRecord
2916	lea	56(%rsi),%r11		# &disp->HandlerData
2917	lea	24(%rsi),%r12		# &disp->EstablisherFrame
2918	mov	%r10,32(%rsp)		# arg5
2919	mov	%r11,40(%rsp)		# arg6
2920	mov	%r12,48(%rsp)		# arg7
2921	mov	%rcx,56(%rsp)		# arg8, (NULL)
2922	call	*__imp_RtlVirtualUnwind(%rip)
2924	mov	\$1,%eax		# ExceptionContinueSearch
2936	.size	se_handler,.-se_handler
###
### Win64 SEH tables: .pdata entries map each function's
### prologue..epilogue range to its unwind info; .xdata entries carry
### the HandlerData[] (body start / epilogue labels) consumed by
### se_handler above. ECB entries are emitted only when enabled.
### The final substitution expands `...` expressions in the emitted
### assembly (standard perlasm post-processing).
###
2941	$code.=<<___ if ($ecb);
2942	.rva	.Lecb_enc_prologue
2943	.rva	.Lecb_enc_epilogue
2946	.rva	.Lecb_dec_prologue
2947	.rva	.Lecb_dec_epilogue
2951	.rva	.Lcbc_dec_prologue
2952	.rva	.Lcbc_dec_epilogue
2955	.rva	.Lctr_enc_prologue
2956	.rva	.Lctr_enc_epilogue
2959	.rva	.Lxts_enc_prologue
2960	.rva	.Lxts_enc_epilogue
2963	.rva	.Lxts_dec_prologue
2964	.rva	.Lxts_dec_epilogue
2970	$code.=<<___ if ($ecb);
2974	.rva	.Lecb_enc_body,.Lecb_enc_epilogue	# HandlerData[]
2978	.rva	.Lecb_dec_body,.Lecb_dec_epilogue	# HandlerData[]
2984	.rva	.Lcbc_dec_body,.Lcbc_dec_epilogue	# HandlerData[]
2988	.rva	.Lctr_enc_body,.Lctr_enc_epilogue	# HandlerData[]
2992	.rva	.Lxts_enc_body,.Lxts_enc_epilogue	# HandlerData[]
2996	.rva	.Lxts_dec_body,.Lxts_dec_epilogue	# HandlerData[]
3000	$code =~ s/\`([^\`]*)\`/eval($1)/gem;