3 ###################################################################
4 ### AES-128 [originally in CTR mode] ###
5 ### bitsliced implementation for Intel Core 2 processors ###
6 ### requires support of SSE extensions up to SSSE3 ###
7 ### Author: Emilia Käsper and Peter Schwabe ###
8 ### Date: 2009-03-19 ###
11 ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
12 ### further information. ###
13 ###################################################################
17 # Started as transliteration to "perlasm" the original code has
18 # undergone following changes:
20 # - code was made position-independent;
21 # - rounds were folded into a loop resulting in >5x size reduction
22 # from 12.5KB to 2.2KB;
23 # - above was possible thanks to mixcolumns() modification that
24 # allowed to feed its output back to aesenc[last], this was
25 # achieved at cost of two additional inter-register moves;
26 # - some instruction reordering and interleaving;
27 # - this module doesn't implement key setup subroutine, instead it
28 # relies on conversion of "conventional" key schedule as returned
29 # by AES_set_encrypt_key (see discussion below);
30 # - first and last round keys are treated differently, which allowed
31 # to skip one shiftrows(), reduce bit-sliced key schedule and
32 # speed-up conversion by 22%;
33 # - support for 192- and 256-bit keys was added;
35 # Resulting performance in CPU cycles spent to encrypt one byte out
36 # of 4096-byte buffer with 128-bit key is:
38 # Emilia's this(*) difference
40 # Core 2 9.30 8.69 +7%
41 # Nehalem(**) 7.63 6.98 +9%
42 # Atom 17.1 17.4 -2%(***)
44 # (*) Comparison is not completely fair, because "this" is ECB,
45 # i.e. no extra processing such as counter values calculation
46 # and xor-ing input as in Emilia's CTR implementation is
47 # performed. However, the CTR calculations stand for not more
48 # than 1% of total time, so comparison is *rather* fair.
50 # (**) Results were collected on Westmere, which is considered to
51 # be equivalent to Nehalem for this code.
53 # (***) Slowdown on Atom is rather strange per se, because original
54 # implementation has a number of 9+-bytes instructions, which
55 # are bad for Atom front-end, and which I eliminated completely.
56 # In attempt to address deterioration sbox() was tested in FP
57 # SIMD "domain" (movaps instead of movdqa, xorps instead of
58 # pxor, etc.). While it resulted in nominal 4% improvement on
59 # Atom, it hurt Westmere by more than a 2x factor.
61 # As for key schedule conversion subroutine. Interface to OpenSSL
62 # relies on per-invocation on-the-fly conversion. This naturally
63 # has impact on performance, especially for short inputs. Conversion
64 # time in CPU cycles and its ratio to CPU cycles spent in 8x block
67 # conversion conversion/8x block
72 # The ratio values mean that 128-byte blocks will be processed
73 # 21-27% slower, 256-byte blocks - 12-16%, 384-byte blocks - 8-11%,
74 # etc. Then keep in mind that input sizes not divisible by 128 are
75 # *effectively* slower, especially shortest ones, e.g. consecutive
76 # 144-byte blocks are processed 44% slower than one would expect,
77 # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
78 # it's still faster than ["hyper-threading-safe" code path in]
79 # aes-x86_64.pl on all lengths above 64 bytes...
85 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
87 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the perlasm translator next to this script or in ../../perlasm.
89 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
90 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
91 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
92 die "can't locate x86_64-xlate.pl";
# Pipe all generated code through the translator.  Quote $^X so a perl
# binary installed in a path containing spaces still works, and check the
# piped open explicitly: the original discarded a failed open and would
# have written the assembly into a broken pipe without any diagnostic.
94 open STDOUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
# Argument registers (SysV AMD64 order) for the entry points below.
# NOTE(review): only four registers are listed for five names, so $ivp is
# left undef here — the IV pointer appears to be pulled separately in the
# CTR path; confirm against the full file.
96 my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
97 my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
# Scratch registers for the key-schedule conversion code.  This $key
# shadows the one above (presumably inside its own { } scope in the full
# file, elided from this chunk).
100 my ($key,$rounds,$const)=("%rax","%r10d","%r11");
# Fragment of sbox(): bit-sliced AES S-box as basis change -> GF(2^8)
# inversion -> output basis change (sub header/body partly elided here).
103 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
104 # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
109 &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
110 &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
# InBasisChange() bit-permutation contract (body elided in this chunk):
114 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
115 # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
# OutBasisChange() bit-permutation contract (body elided in this chunk):
137 # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
138 # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
158 #;*************************************************************
159 #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
160 #;*************************************************************
# Parameter unpack of Mul_GF4(): GF(2^2) multiply (header elided here).
161 my ($x0,$x1,$y0,$y1,$t0)=@_;
174 sub Mul_GF4_N { # not used, see next subroutine
175 # multiply and scale by N
176 my ($x0,$x1,$y0,$y1,$t0)=@_;
# Parameter unpack of Mul_GF4_N_GF4() (header elided in this chunk):
190 # interleaved Mul_GF4_N and Mul_GF4
191 my ($x0,$x1,$y0,$y1,$t0,
192 $x2,$x3,$y2,$y3,$t1)=@_;
# Fragment of Mul_GF16_2(): paired GF(2^4) multiplies used by Inv_GF256.
220 &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
# Consistency fix: call with explicit & like every sibling call site — the
# bareword form only worked because the sub happened to be compiled
# already; the &-form is uniform and behaviorally identical here.
227 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
228 @x[2], @x[3], @y[2], @y[3], @t[2]);
240 &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
241 @x[6], @x[7], @y[2], @y[3], @t[2]);
246 &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
255 #;********************************************************************
256 #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
257 #;********************************************************************
261 # direct optimizations from hardware
316 #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
318 # new smaller inversion
352 # output in s3, s2, s1, t1
354 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
356 # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
357 &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
359 ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
362 # AES linear components
# Fragment of add_roundkey-style code: xor the eight bit-sliced state
# slices with the 128-byte expanded round key at ($key).
368 pxor 0x00($key),@x[0]
369 pxor 0x10($key),@x[1]
371 pxor 0x20($key),@x[2]
373 pxor 0x30($key),@x[3]
375 pxor 0x40($key),@x[4]
377 pxor 0x50($key),@x[5]
379 pxor 0x60($key),@x[6]
381 pxor 0x70($key),@x[7]
# Fragment of mixcolumns(): per-slice rotations built from pshufd
# (0x93 = rotate dwords by one, 0x4E = swap dword halves).
389 # modified to emit output in order suitable for feeding back to aesenc[last]
393 pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
394 pshufd \$0x93, @x[1], @t[1]
395 pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
396 pshufd \$0x93, @x[2], @t[2]
398 pshufd \$0x93, @x[3], @t[3]
400 pshufd \$0x93, @x[4], @t[4]
402 pshufd \$0x93, @x[5], @t[5]
404 pshufd \$0x93, @x[6], @t[6]
406 pshufd \$0x93, @x[7], @t[7]
413 pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
415 pshufd \$0x4E, @x[1], @x[1]
421 pshufd \$0x4E, @x[4], @t[0]
423 pshufd \$0x4E, @x[5], @t[1]
425 pshufd \$0x4E, @x[3], @x[4]
427 pshufd \$0x4E, @x[7], @x[5]
429 pshufd \$0x4E, @x[6], @x[3]
431 pshufd \$0x4E, @x[2], @x[6]
# Reference implementations of one full round, kept for documentation
# only (marked "not used"); bodies are partly elided in this chunk.
443 sub aesenc { # not used
447 movdqa 0x30($const),@t[0] # .LSR
449 &shiftrows (@b,@t[0]);
451 &mixcolumns (@b[0,1,4,6,3,7,2,5],@t);
454 sub aesenclast { # not used
458 movdqa 0x40($const),@t[0] # .LSRM0
460 &shiftrows (@b,@t[0]);
# Last round: no mixcolumns; xor round key in the sbox output order
# [b0,b1,b4,b6,b3,b7,b2,b5] (see sbox() output comment above).
463 pxor 0x00($key),@b[0]
464 pxor 0x10($key),@b[1]
465 pxor 0x20($key),@b[4]
466 pxor 0x30($key),@b[6]
467 pxor 0x40($key),@b[3]
468 pxor 0x50($key),@b[7]
469 pxor 0x60($key),@b[2]
470 pxor 0x70($key),@b[5]
# Parameter unpacks of swapmove()/swapmove2x(): masked bit-exchange
# between two (resp. two pairs of) registers (headers elided here).
475 my ($a,$b,$n,$mask,$t)=@_;
487 my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
# Fragment of bitslice(): transpose 8 state registers into bit-sliced
# form via swapmove2x at bit distances 1, 2 and 4.
507 my @x=reverse(@_[0..7]);
508 my ($t0,$t1,$t2,$t3)=@_[8..11];
510 movdqa 0x00($const),$t0 # .LBS0
511 movdqa 0x10($const),$t1 # .LBS1
513 &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
514 &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
516 movdqa 0x20($const),$t0 # .LBS2
518 &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
519 &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
521 &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
522 &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
# _bsaes_encrypt8: encrypt 8 blocks held in @XMM[0..7]; %rax = bit-sliced
# key schedule, %r10d = rounds.  Prologue xors round-0 key and applies
# .LM0SR (combined input permutation + first shiftrows), so the round
# loop can skip one shiftrows (fragments; loop body partly elided).
530 .type _bsaes_encrypt8,\@abi-omnipotent
533 lea .LBS0(%rip), $const # constants table
535 movdqa ($key), @XMM[9] # round 0 key
537 movdqa 0x60($const), @XMM[8] # .LM0SR
538 pxor @XMM[9], @XMM[0] # xor with round0 key
539 pxor @XMM[9], @XMM[1]
540 pshufb @XMM[8], @XMM[0]
541 pxor @XMM[9], @XMM[2]
542 pshufb @XMM[8], @XMM[1]
543 pxor @XMM[9], @XMM[3]
544 pshufb @XMM[8], @XMM[2]
545 pxor @XMM[9], @XMM[4]
546 pshufb @XMM[8], @XMM[3]
547 pxor @XMM[9], @XMM[5]
548 pshufb @XMM[8], @XMM[4]
549 pxor @XMM[9], @XMM[6]
550 pshufb @XMM[8], @XMM[5]
551 pxor @XMM[9], @XMM[7]
552 pshufb @XMM[8], @XMM[6]
553 pshufb @XMM[8], @XMM[7]
# Secondary entry point used by the CTR path, which performs its own
# (byte-swapping) prologue before jumping here.
554 _bsaes_encrypt8_bitslice:
556 &bitslice (@XMM[0..7, 8..11])
563 &shiftrows (@XMM[0..7, 8]);
564 $code.=".Lenc_sbox:\n";
565 &sbox (@XMM[0..7, 8..15]);
570 &mixcolumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
572 movdqa 0x30($const), @XMM[8] # .LSR
574 movdqa 0x40($const), @XMM[8] # .LSRM0
# Un-bitslice in the sbox output order, then xor the last round key;
# callers must read results in order [0,1,4,6,3,7,2,5].
579 # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
580 &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
582 movdqa ($key), @XMM[8] # last round key
583 pxor @XMM[8], @XMM[0]
584 pxor @XMM[8], @XMM[1]
585 pxor @XMM[8], @XMM[4]
586 pxor @XMM[8], @XMM[6]
587 pxor @XMM[8], @XMM[3]
588 pxor @XMM[8], @XMM[7]
589 pxor @XMM[8], @XMM[2]
590 pxor @XMM[8], @XMM[5]
592 .size _bsaes_encrypt8,.-_bsaes_encrypt8
# Register layout for the key-schedule conversion section.
596 my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
# Fragment of bitslice_key(): like bitslice() but the low-bit swaps are
# reduced because the same 128-bit key is replicated across all slices.
599 my @x=reverse(@_[0..7]);
600 my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
602 &swapmove (@x[0,1],1,$bs0,$t2,$t3);
604 #&swapmove(@x[2,3],1,$t0,$t2,$t3);
608 #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
610 &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
612 #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
618 &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
619 &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
# _bsaes_enc_key_convert: convert a conventional AES_set_encrypt_key
# schedule at $inp into the bit-sliced schedule at $out (fragments; the
# per-round loop control is partly elided in this chunk).
623 .type _bsaes_enc_key_convert,\@abi-omnipotent
625 _bsaes_enc_key_convert:
626 lea .LBS1(%rip), $const
627 movdqu ($inp), %xmm7 # load round 0 key
628 movdqa -0x10($const), %xmm8 # .LBS0
629 movdqa 0x00($const), %xmm9 # .LBS1
630 movdqa 0x10($const), %xmm10 # .LBS2
631 movdqa 0x40($const), %xmm13 # .LM0
632 movdqa 0x60($const),%xmm14 # .LNOT
634 movdqu 0x10($inp), %xmm6 # load round 1 key
# Round 0 key is stored unconverted (see _bsaes_encrypt8 prologue).
636 movdqa %xmm7, ($out) # save round 0 key
645 &bitslice_key (map("%xmm$_",(0..7, 8..12)));
# Slices 5/6 are stored complemented so the sbox can save some pxors.
647 pxor %xmm14, %xmm5 # "pnot"
652 movdqa %xmm0, 0x00($out) # write bit-sliced round key
653 movdqa %xmm1, 0x10($out)
654 movdqa %xmm2, 0x20($out)
655 movdqa %xmm3, 0x30($out)
656 movdqa %xmm4, 0x40($out)
657 movdqa %xmm5, 0x50($out)
658 movdqa %xmm6, 0x60($out)
659 movdqa %xmm7, 0x70($out)
661 movdqu ($inp), %xmm6 # load next round key
# Last round key: xor with 0x63 so the sbox affine constant cancels.
665 pxor 0x70($const), %xmm6 # .L63
666 movdqa %xmm6, ($out) # save last round key
668 .size _bsaes_enc_key_convert,.-_bsaes_enc_key_convert
# Unsupported, benchmark-only entry points (SysV-only: guarded by
# !$win64).  Fragments; prologue/epilogue lines are elided in this chunk.
672 if (1 && !$win64) { # following two functions are unsupported interface
673 # used for benchmarking...
# bsaes_enc_key_convert(schedule, key): thin shim that marshals the two
# arguments into the registers _bsaes_enc_key_convert expects.
675 .globl bsaes_enc_key_convert
676 .type bsaes_enc_key_convert,\@function,2
678 bsaes_enc_key_convert:
679 mov 240($inp),%r10d # pass rounds
680 mov $inp,%rcx # pass key
681 mov $out,%rax # pass key schedule
682 call _bsaes_enc_key_convert
684 .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
# bsaes_encrypt_128(in, out, len, schedule): encrypt 8 blocks at a time
# with a pre-converted schedule (loop control elided in this chunk).
686 .globl bsaes_encrypt_128
687 .type bsaes_encrypt_128,\@function,4
691 movdqu 0x00($inp), @XMM[0] # load input
692 movdqu 0x10($inp), @XMM[1]
693 movdqu 0x20($inp), @XMM[2]
694 movdqu 0x30($inp), @XMM[3]
695 movdqu 0x40($inp), @XMM[4]
696 movdqu 0x50($inp), @XMM[5]
697 movdqu 0x60($inp), @XMM[6]
698 movdqu 0x70($inp), @XMM[7]
699 mov $key, %rax # pass the $key
# Store in _bsaes_encrypt8's output order [0,1,4,6,3,7,2,5].
705 movdqu @XMM[0], 0x00($out) # write output
706 movdqu @XMM[1], 0x10($out)
707 movdqu @XMM[4], 0x20($out)
708 movdqu @XMM[6], 0x30($out)
709 movdqu @XMM[3], 0x40($out)
710 movdqu @XMM[7], 0x50($out)
711 movdqu @XMM[2], 0x60($out)
712 movdqu @XMM[5], 0x70($out)
717 .size bsaes_encrypt_128,.-bsaes_encrypt_128
721 ######################################################################
# Public ECB interface: bsaes_ecb_encrypt_blocks(in, out, len, key).
# ABI-dependent argument registers; arguments are then parked in
# callee-saved GPRs so they survive the helper calls.
725 my ($arg1,$arg2,$arg3,$arg4,$arg5) = $win64 ? ("%rcx","%rdx","%r8","%r9","%r10")
726 : ("%rdi","%rsi","%rdx","%rcx","%r8");
727 my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
730 .globl bsaes_ecb_encrypt_blocks
731 .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
733 bsaes_ecb_encrypt_blocks:
# Win64 only: spill non-volatile xmm6-xmm15.  NOTE(review): stores reach
# 0xd0(%rsp) with only 0xa0 reserved here — presumably a preceding stack
# adjustment elided from this chunk covers the rest; confirm in full file.
742 $code.=<<___ if ($win64);
743 lea -0xa0(%rsp), %rsp
744 movaps %xmm6, 0x40(%rsp)
745 movaps %xmm7, 0x50(%rsp)
746 movaps %xmm8, 0x60(%rsp)
747 movaps %xmm9, 0x70(%rsp)
748 movaps %xmm10, 0x80(%rsp)
749 movaps %xmm11, 0x90(%rsp)
750 movaps %xmm12, 0xa0(%rsp)
751 movaps %xmm13, 0xb0(%rsp)
752 movaps %xmm14, 0xc0(%rsp)
753 movaps %xmm15, 0xd0(%rsp)
757 mov %rsp,%rbp # backup %rsp
758 mov 240($arg4),%eax # rounds
759 mov $arg1,$inp # backup arguments
# Allocate the bit-sliced key schedule on the stack and convert into it.
766 mov %eax,%ebx # backup rounds
767 shl \$7,%rax # 128 bytes per inner round key
768 sub \$`128-32`,%rax # size of bit-sliced key schedule
770 mov %rsp,%rax # pass key schedule
771 mov $key,%rcx # pass key
772 mov %ebx,%r10d # pass rounds
773 call _bsaes_enc_key_convert
# Main loop: 8 full blocks per iteration (call site elided in chunk).
777 movdqu 0x00($inp), @XMM[0] # load input
778 movdqu 0x10($inp), @XMM[1]
779 movdqu 0x20($inp), @XMM[2]
780 movdqu 0x30($inp), @XMM[3]
781 movdqu 0x40($inp), @XMM[4]
782 movdqu 0x50($inp), @XMM[5]
783 mov %rsp, %rax # pass key schedule
784 movdqu 0x60($inp), @XMM[6]
785 mov %ebx,%r10d # pass rounds
786 movdqu 0x70($inp), @XMM[7]
# Writes below use _bsaes_encrypt8's output order [0,1,4,6,3,7,2,5].
791 movdqu @XMM[0], 0x00($out) # write output
792 movdqu @XMM[1], 0x10($out)
793 movdqu @XMM[4], 0x20($out)
794 movdqu @XMM[6], 0x30($out)
795 movdqu @XMM[3], 0x40($out)
796 movdqu @XMM[7], 0x50($out)
797 movdqu @XMM[2], 0x60($out)
798 movdqu @XMM[5], 0x70($out)
# Tail: 1..7 remaining blocks; loads fall through to the right count
# (dispatch labels elided in this chunk).
806 movdqu 0x00($inp), @XMM[0] # load input
807 mov %rsp, %rax # pass key schedule
808 mov %ebx,%r10d # pass rounds
811 movdqu 0x10($inp), @XMM[1]
813 movdqu 0x20($inp), @XMM[2]
816 movdqu 0x30($inp), @XMM[3]
818 movdqu 0x40($inp), @XMM[4]
821 movdqu 0x50($inp), @XMM[5]
823 movdqu 0x60($inp), @XMM[6]
# Per-count epilogues writing 7, 6, 5, 4, 3, 2, 1 output blocks.
825 movdqu @XMM[0], 0x00($out) # write output
826 movdqu @XMM[1], 0x10($out)
827 movdqu @XMM[4], 0x20($out)
828 movdqu @XMM[6], 0x30($out)
829 movdqu @XMM[3], 0x40($out)
830 movdqu @XMM[7], 0x50($out)
831 movdqu @XMM[2], 0x60($out)
836 movdqu @XMM[0], 0x00($out) # write output
837 movdqu @XMM[1], 0x10($out)
838 movdqu @XMM[4], 0x20($out)
839 movdqu @XMM[6], 0x30($out)
840 movdqu @XMM[3], 0x40($out)
841 movdqu @XMM[7], 0x50($out)
846 movdqu @XMM[0], 0x00($out) # write output
847 movdqu @XMM[1], 0x10($out)
848 movdqu @XMM[4], 0x20($out)
849 movdqu @XMM[6], 0x30($out)
850 movdqu @XMM[3], 0x40($out)
855 movdqu @XMM[0], 0x00($out) # write output
856 movdqu @XMM[1], 0x10($out)
857 movdqu @XMM[4], 0x20($out)
858 movdqu @XMM[6], 0x30($out)
863 movdqu @XMM[0], 0x00($out) # write output
864 movdqu @XMM[1], 0x10($out)
865 movdqu @XMM[4], 0x20($out)
870 movdqu @XMM[0], 0x00($out) # write output
871 movdqu @XMM[1], 0x10($out)
876 movdqu @XMM[0], 0x00($out) # write output
# Zero the on-stack key schedule before returning (loop control elided).
892 .Lecb_enc_bzero: # wipe key schedule [if any]
893 movdqa %xmm0, 0x00(%rax)
894 movdqa %xmm0, 0x10(%rax)
899 lea (%rbp),%rsp # restore %rsp
# Win64 only: restore the spilled xmm registers.
901 $code.=<<___ if ($win64);
902 movaps 0x40(%rbp), %xmm6
903 movaps 0x50(%rbp), %xmm7
904 movaps 0x60(%rbp), %xmm8
905 movaps 0x70(%rbp), %xmm9
906 movaps 0x80(%rbp), %xmm10
907 movaps 0x90(%rbp), %xmm11
908 movaps 0xa0(%rbp), %xmm12
909 movaps 0xb0(%rbp), %xmm13
910 movaps 0xc0(%rbp), %xmm14
911 movaps 0xd0(%rbp), %xmm15
924 .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
# bsaes_ctr32_encrypt_blocks(in, out, blocks, key, ivp): CTR mode with a
# 32-bit big-endian counter in the last dword of the IV (fragments; some
# prologue pushes and branch targets are elided in this chunk).
926 .globl bsaes_ctr32_encrypt_blocks
927 .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
929 bsaes_ctr32_encrypt_blocks:
936 lea -0x48(%rsp), %rsp
# Win64: 5th argument (ivp) comes from the stack; then spill xmm6-xmm15.
938 $code.=<<___ if ($win64);
939 mov 0xa0(%rsp),$arg5 # pull ivp
940 lea -0xa0(%rsp), %rsp
941 movaps %xmm6, 0x40(%rsp)
942 movaps %xmm7, 0x50(%rsp)
943 movaps %xmm8, 0x60(%rsp)
944 movaps %xmm9, 0x70(%rsp)
945 movaps %xmm10, 0x80(%rsp)
946 movaps %xmm11, 0x90(%rsp)
947 movaps %xmm12, 0xa0(%rsp)
948 movaps %xmm13, 0xb0(%rsp)
949 movaps %xmm14, 0xc0(%rsp)
950 movaps %xmm15, 0xd0(%rsp)
954 mov %rsp, %rbp # backup %rsp
955 movdqu ($arg5), %xmm0 # load counter
956 mov 240($arg4), %eax # rounds
957 mov $arg1, $inp # backup arguments
# Keep a working copy of the counter block in the frame at 0x20(%rbp).
961 movdqa %xmm0, 0x20(%rbp) # copy counter
# Allocate the bit-sliced key schedule on the stack and convert into it.
965 mov %eax, %ebx # rounds
966 shl \$7, %rax # 128 bytes per inner round key
967 sub \$`128-32`, %rax # size of bit-sliced key schedule
970 mov %rsp, %rax # pass key schedule
971 mov $key, %rcx # pass key
972 mov %ebx, %r10d # pass rounds
973 call _bsaes_enc_key_convert
# Pre-swap the counter's byte order (and the round-0 key to match) so the
# counter can be incremented with plain paddd inside the loop.
975 movdqa (%rsp), @XMM[9] # load round0 key
976 lea .LADD1(%rip), %r11
977 movdqa 0x20(%rbp), @XMM[0] # counter copy
978 movdqa -0x20(%r11), @XMM[8] # .LSWPUP
979 pshufb @XMM[8], @XMM[9] # byte swap upper part
980 pshufb @XMM[8], @XMM[0]
981 movdqa @XMM[9], (%rsp) # save adjusted round0 key
985 movdqa @XMM[0], 0x20(%rbp) # save counter
# Materialize counter, counter+1 .. counter+7 via the .LADDn constants.
986 movdqa @XMM[0], @XMM[1] # prepare 8 counter values
987 movdqa @XMM[0], @XMM[2]
988 paddd 0x00(%r11), @XMM[1] # .LADD1
989 movdqa @XMM[0], @XMM[3]
990 paddd 0x10(%r11), @XMM[2] # .LADD2
991 movdqa @XMM[0], @XMM[4]
992 paddd 0x20(%r11), @XMM[3] # .LADD3
993 movdqa @XMM[0], @XMM[5]
994 paddd 0x30(%r11), @XMM[4] # .LADD4
995 movdqa @XMM[0], @XMM[6]
996 paddd 0x40(%r11), @XMM[5] # .LADD5
997 movdqa @XMM[0], @XMM[7]
998 paddd 0x50(%r11), @XMM[6] # .LADD6
999 paddd 0x60(%r11), @XMM[7] # .LADD7
1001 # Borrow prologue from _bsaes_encrypt8 to use the opportunity
1002 # to flip byte order in 32-bit counter
1003 movdqa (%rsp), @XMM[9] # round 0 key
1004 lea 0x10(%rsp), %rax # pass key schedule
1005 movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
1006 pxor @XMM[9], @XMM[0] # xor with round0 key
1007 pxor @XMM[9], @XMM[1]
1008 pshufb @XMM[8], @XMM[0]
1009 pxor @XMM[9], @XMM[2]
1010 pshufb @XMM[8], @XMM[1]
1011 pxor @XMM[9], @XMM[3]
1012 pshufb @XMM[8], @XMM[2]
1013 pxor @XMM[9], @XMM[4]
1014 pshufb @XMM[8], @XMM[3]
1015 pxor @XMM[9], @XMM[5]
1016 pshufb @XMM[8], @XMM[4]
1017 pxor @XMM[9], @XMM[6]
1018 pshufb @XMM[8], @XMM[5]
1019 pxor @XMM[9], @XMM[7]
1020 pshufb @XMM[8], @XMM[6]
1021 lea .LBS0(%rip), %r11 # constants table
1022 pshufb @XMM[8], @XMM[7]
1023 mov %ebx,%r10d # pass rounds
# Enter past the standard prologue — counters are already xored/permuted.
1025 call _bsaes_encrypt8_bitslice
1028 jc .Lctr_enc_loop_done
# Full iteration: xor 8 keystream blocks into 8 input blocks.
1030 movdqu 0x00($inp), @XMM[8] # load input
1031 movdqu 0x10($inp), @XMM[9]
1032 movdqu 0x20($inp), @XMM[10]
1033 movdqu 0x30($inp), @XMM[11]
1034 movdqu 0x40($inp), @XMM[12]
1035 movdqu 0x50($inp), @XMM[13]
1036 movdqu 0x60($inp), @XMM[14]
1037 movdqu 0x70($inp), @XMM[15]
# Keystream registers appear in _bsaes_encrypt8 output order.
1039 pxor @XMM[0], @XMM[8]
1040 movdqa 0x20(%rbp), @XMM[0] # load counter
1041 pxor @XMM[9], @XMM[1]
1042 movdqu @XMM[8], 0x00($out) # write output
1043 pxor @XMM[10], @XMM[4]
1044 movdqu @XMM[1], 0x10($out)
1045 pxor @XMM[11], @XMM[6]
1046 movdqu @XMM[4], 0x20($out)
1047 pxor @XMM[12], @XMM[3]
1048 movdqu @XMM[6], 0x30($out)
1049 pxor @XMM[13], @XMM[7]
1050 movdqu @XMM[3], 0x40($out)
1051 pxor @XMM[14], @XMM[2]
1052 movdqu @XMM[7], 0x50($out)
1053 pxor @XMM[15], @XMM[5]
1054 movdqu @XMM[2], 0x60($out)
1055 lea .LADD1(%rip), %r11
1056 movdqu @XMM[5], 0x70($out)
1057 lea 0x80($out), $out
# Advance the saved counter by 8 for the next iteration.
1058 paddd 0x70(%r11), @XMM[0] # .LADD8
# Partial final iteration: store only as many blocks as remain
# (count checks between the stores are elided in this chunk).
1063 .Lctr_enc_loop_done:
1064 movdqu 0x00($inp), @XMM[8] # load input
1065 pxor @XMM[8], @XMM[0]
1066 movdqu @XMM[0], 0x00($out) # write output
1069 movdqu 0x10($inp), @XMM[9]
1070 pxor @XMM[9], @XMM[1]
1071 movdqu @XMM[1], 0x10($out)
1073 movdqu 0x20($inp), @XMM[10]
1074 pxor @XMM[10], @XMM[4]
1075 movdqu @XMM[4], 0x20($out)
1078 movdqu 0x30($inp), @XMM[11]
1079 pxor @XMM[11], @XMM[6]
1080 movdqu @XMM[6], 0x30($out)
1082 movdqu 0x40($inp), @XMM[12]
1083 pxor @XMM[12], @XMM[3]
1084 movdqu @XMM[3], 0x40($out)
1087 movdqu 0x50($inp), @XMM[13]
1088 pxor @XMM[13], @XMM[7]
1089 movdqu @XMM[7], 0x50($out)
1091 movdqu 0x60($inp), @XMM[14]
1092 pxor @XMM[14], @XMM[2]
1093 movdqu @XMM[2], 0x60($out)
# Short-input tail: encrypt the counter block kept at 0x20(%rbp) one
# block at a time (result lands at 0x30(%rbp)), xor it into the input,
# and bump the 32-bit counter word at 0x2c(%rbp).
1098 lea 0x20(%rbp), $arg1
1099 lea 0x30(%rbp), $arg2
1102 movdqu ($inp), @XMM[1]
1104 mov 0x2c(%rbp), %eax # load 32-bit counter
1106 pxor 0x30(%rbp), @XMM[1]
1107 inc %eax # increment
1108 movdqu @XMM[1], ($out)
# BUG FIX: the counter word lives at 0x2c(%rbp) (loaded above), but the
# incremented value was written to 0x2c(%rsp).  After the key schedule is
# carved out below %rbp, %rsp no longer equals %rbp, so the increment was
# lost and every tail block reused the same counter.  Store via %rbp.
1111 mov %eax, 0x2c(%rbp) # save 32-bit counter
# Zero the on-stack key schedule, then unwind the frame.
1118 .Lctr_enc_bzero: # wipe key schedule [if any]
1119 movdqa %xmm0, 0x00(%rax)
1120 movdqa %xmm0, 0x10(%rax)
1121 lea 0x20(%rax), %rax
1125 lea (%rbp),%rsp # restore %rsp
# Win64 only: restore the spilled xmm registers and the extra 0xa0 bytes.
1127 $code.=<<___ if ($win64);
1128 movaps 0x40(%rbp), %xmm6
1129 movaps 0x50(%rbp), %xmm7
1130 movaps 0x60(%rbp), %xmm8
1131 movaps 0x70(%rbp), %xmm9
1132 movaps 0x80(%rbp), %xmm10
1133 movaps 0x90(%rbp), %xmm11
1134 movaps 0xa0(%rbp), %xmm12
1135 movaps 0xb0(%rbp), %xmm13
1136 movaps 0xc0(%rbp), %xmm14
1137 movaps 0xd0(%rbp), %xmm15
1138 lea 0xa0(%rbp), %rsp
# Reload callee-saved GPRs; offsets mirror the prologue pushes plus the
# 0x48-byte frame (push sequence elided in this chunk — confirm there).
1141 mov 0x48(%rsp), %r15
1142 mov 0x50(%rsp), %r14
1143 mov 0x58(%rsp), %r13
1144 mov 0x60(%rsp), %r12
1145 mov 0x68(%rsp), %rbx
1146 mov 0x70(%rsp), %rbp
1147 lea 0x78(%rsp), %rsp
1150 .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
# Constant table referenced %rip-relatively by the code above.
# .LBS0/1/2: swapmove masks for bit distances 1, 2, 4.
1155 .LBS0: .quad 0x5555555555555555, 0x5555555555555555
1156 .LBS1: .quad 0x3333333333333333, 0x3333333333333333
1157 .LBS2: .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
# Byte-shuffle constants: shiftrows (.LSR), shiftrows folded with the
# inverse input permutation (.LSRM0), input permutation (.LM0), and input
# permutation combined with first shiftrows (.LM0SR).
1158 .LSR: .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
1159 .LSRM0: .quad 0x0304090e00050a0f, 0x01060b0c0207080d
1160 .LM0: .quad 0x02060a0e03070b0f, 0x0004080c0105090d
1161 .LM0SR: .quad 0x0a0e02060f03070b, 0x0004080c05090d01
1162 .LNOT: .quad 0xffffffffffffffff, 0xffffffffffffffff
1163 .L63: .quad 0x6363636363636363, 0x6363636363636363
# NOTE(review): the next two .quad pairs are referenced by the CTR code
# as -0x20(%r11)/.LSWPUP and -0x10(%r11)/.LSWPUPM0SR relative to .LADD1;
# their labels appear to be elided from this chunk — confirm in full file.
1165 .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
1167 .quad 0x0a0d02060c03070b, 0x0004080f05090e01
# .LADDn: add n to the big-endian 32-bit counter lane (after byte swap).
1168 .LADD1: .quad 0x0000000000000000, 0x0000000100000000
1169 .LADD2: .quad 0x0000000000000000, 0x0000000200000000
1170 .LADD3: .quad 0x0000000000000000, 0x0000000300000000
1171 .LADD4: .quad 0x0000000000000000, 0x0000000400000000
1172 .LADD5: .quad 0x0000000000000000, 0x0000000500000000
1173 .LADD6: .quad 0x0000000000000000, 0x0000000600000000
1174 .LADD7: .quad 0x0000000000000000, 0x0000000700000000
1175 .LADD8: .quad 0x0000000000000000, 0x0000000800000000
1176 .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper and Peter Schwabe"
# Evaluate `...` arithmetic embedded in the generated code (e.g. 128-32)
# before it is emitted to the translator.
1180 $code =~ s/\`([^\`]*)\`/eval($1)/gem;