2 # Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements support for Intel AES-NI extension. In
18 # OpenSSL context it's used with Intel engine, but can also be used as
19 # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
24 # Given aes(enc|dec) instructions' latency asymptotic performance for
25 # non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
26 # processed with 128-bit key. And given their throughput asymptotic
27 # performance for parallelizable modes is 1.25 cycles per byte. Being
28 # asymptotic limit it's not something you commonly achieve in reality,
29 # but how close does one get? Below are results collected for
30 # different modes and block sizes. Pairs of numbers are for en-/
33 # 16-byte 64-byte 256-byte 1-KB 8-KB
34 # ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
35 # CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
36 # CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
37 # CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
38 # OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
39 # CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
41 # ECB, CTR, CBC and CCM results are free from EVP overhead. This means
42 # that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
43 # [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
44 # The results were collected with specially crafted speed.c benchmark
45 # in order to compare them with results reported in "Intel Advanced
46 # Encryption Standard (AES) New Instruction Set" White Paper Revision
47 # 3.0 dated May 2010. All above results are consistently better. This
48 # module also provides better performance for block sizes smaller than
49 # 128 bytes in points *not* represented in the above table.
51 # Looking at the results for 8-KB buffer.
53 # CFB and OFB results are far from the limit, because implementation
54 # uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
55 # single-block aesni_encrypt, which is not the most optimal way to go.
56 # CBC encrypt result is unexpectedly high and there is no documented
57 # explanation for it. Seemingly there is a small penalty for feeding
58 # the result back to AES unit the way it's done in CBC mode. There is
59 # nothing one can do and the result appears optimal. CCM result is
60 # identical to CBC, because CBC-MAC is essentially CBC encrypt without
61 # saving output. CCM CTR "stays invisible," because it's neatly
62 # interleaved with CBC-MAC. This provides ~30% improvement over
63 # "straightforward" CCM implementation with CTR and CBC-MAC performed
64 # disjointly. Parallelizable modes practically achieve the theoretical
67 # Looking at how results vary with buffer size.
69 # Curves are practically saturated at 1-KB buffer size. In most cases
70 # "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
71 # CTR curve doesn't follow this pattern and is "slowest" changing one
72 # with "256-byte" result being 87% of "8-KB." This is because overhead
73 # in CTR mode is most computationally intensive. Small-block CCM
74 # decrypt is slower than encrypt, because first CTR and last CBC-MAC
75 # iterations can't be interleaved.
77 # Results for 192- and 256-bit keys.
79 # EVP-free results were observed to scale perfectly with number of
80 # rounds for larger block sizes, i.e. 192-bit result being 10/12 times
81 # lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
82 # are a tad smaller, because the above mentioned penalty biases all
83 # results by same constant value. In similar way function call
84 # overhead affects small-block performance, as well as OFB and CFB
85 # results. Differences are not large, most common coefficients are
86 # 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one can
87 # observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
91 # While Westmere processor features 6 cycles latency for aes[enc|dec]
92 # instructions, which can be scheduled every second cycle, Sandy
93 # Bridge spends 8 cycles per instruction, but it can schedule them
94 # every cycle. This means that code targeting Westmere would perform
95 # suboptimally on Sandy Bridge. Therefore this update.
97 # In addition, non-parallelizable CBC encrypt (as well as CCM) is
98 # optimized. Relative improvement might appear modest, 8% on Westmere,
99 # but in absolute terms it's 3.77 cycles per byte encrypted with
100 # 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
101 # should be compared to asymptotic limits of 3.75 for Westmere and
102 # 5.00 for Sandy Bridge. Actually, the fact that they get this close
103 # to asymptotic limits is quite amazing. Indeed, the limit is
104 # calculated as latency times number of rounds, 10 for 128-bit key,
105 # and divided by 16, the number of bytes in block, or in other words
106 # it accounts *solely* for aesenc instructions. But there are extra
107 # instructions, and numbers so close to the asymptotic limits mean
108 # that it's as if it takes as little as *one* additional cycle to
109 # execute all of them. How is it possible? It is possible thanks to
110 # out-of-order execution logic, which manages to overlap post-
111 # processing of previous block, things like saving the output, with
112 # actual encryption of current block, as well as pre-processing of
113 # current block, things like fetching input and xor-ing it with
114 # 0-round element of the key schedule, with actual encryption of
115 # previous block. Keep this in mind...
117 # For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
118 # performance is achieved by interleaving instructions working on
119 # independent blocks. In which case asymptotic limit for such modes
120 # can be obtained by dividing above mentioned numbers by AES
121 # instructions' interleave factor. Westmere can execute at most 3
122 # instructions at a time, meaning that optimal interleave factor is 3,
123 # and that's where the "magic" number of 1.25 comes from. "Optimal
124 # interleave factor" means that increase of interleave factor does
125 # not improve performance. The formula has proven to reflect reality
126 # pretty well on Westmere... Sandy Bridge on the other hand can
127 # execute up to 8 AES instructions at a time, so how does varying
128 # interleave factor affect the performance? Here is table for ECB
129 # (numbers are cycles per byte processed with 128-bit key):
131 # instruction interleave factor 3x 6x 8x
132 # theoretical asymptotic limit 1.67 0.83 0.625
133 # measured performance for 8KB block 1.05 0.86 0.84
135 # "as if" interleave factor 4.7x 5.8x 6.0x
137 # Further data for other parallelizable modes:
139 # CBC decrypt 1.16 0.93 0.74
142 # Well, given 3x column it's probably inappropriate to call the limit
143 # asymptotic, if it can be surpassed, isn't it? What happens there?
144 # Rewind to CBC paragraph for the answer. Yes, out-of-order execution
145 # magic is responsible for this. Processor overlaps not only the
146 # additional instructions with AES ones, but even AES instructions
147 # processing adjacent triplets of independent blocks. In the 6x case
148 # additional instructions still claim disproportionally small amount
149 # of additional cycles, but in 8x case number of instructions must be
150 # a tad too high for out-of-order logic to cope with, and AES unit
151 # remains underutilized... As you can see 8x interleave is hardly
152 # justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
153 # utilizes 6x interleave because of limited register bank capacity.
155 # Higher interleave factors do have negative impact on Westmere
156 # performance. While for ECB mode it's negligible ~1.5%, other
157 # parallelizables perform ~5% worse, which is outweighed by ~25%
158 # improvement on Sandy Bridge. To balance regression on Westmere
159 # CTR mode was implemented with 6x aesenc interleave factor.
163 # Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
164 # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
165 # in CTR mode AES instruction interleave factor was chosen to be 6x.
169 # Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
172 ######################################################################
173 # Current large-block performance in cycles per byte processed with
174 # 128-bit key (less is better).
176 # CBC en-/decrypt CTR XTS ECB OCB
177 # Westmere 3.77/1.25 1.25 1.25 1.26
178 # * Bridge 5.07/0.74 0.75 0.90 0.85 0.98
179 # Haswell 4.44/0.63 0.63 0.73 0.63 0.70
180 # Skylake 2.62/0.63 0.63 0.63 0.63
181 # Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11
182 # Knights L 2.54/0.77 0.78 0.85 - 1.50
183 # Goldmont 3.82/1.26 1.26 1.29 1.29 1.50
184 # Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95
185 # Ryzen 2.71/0.35 0.35 0.44 0.38 0.49
187 # (*) Atom Silvermont ECB result is suboptimal because of penalties
188 # incurred by operations on %xmm8-15. As ECB is not considered
189 # critical, nothing was done to mitigate the problem.
191 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
192 # generates drop-in replacement for
193 # crypto/aes/asm/aes-x86_64.pl:-)
195 # $output is the last argument if it looks like a file (it has an extension)
196 # $flavour is the first argument if it doesn't look like a file
197 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
198 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Win64 mode is selected either by an nasm/masm/mingw64 flavour or by an
# output name ending in ".asm".
200 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Take the directory part of the script path ($0), then look for
# x86_64-xlate.pl next to the script and, failing that, under
# ../../perlasm/ relative to it; give up if neither file exists.
202 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
203 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
204 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
205 die "can't locate x86_64-xlate.pl";
# Open a pipe to the translator script, run with the current perl binary
# ($^X), passing it the flavour and the output name.
207 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
208 or die "can't call $xlate: $!";
# NOTE: both arms of this conditional are "movups", so $movkey is "movups"
# whatever $PREFIX is.
211 $movkey = $PREFIX eq "aesni" ? "movups" : "movups";
# The four argument registers, in order, for the selected calling convention.
212 @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
213 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
# Append an .extern declaration for OPENSSL_ia32cap_P to the generated code.
216 $code.=".extern OPENSSL_ia32cap_P\n";
# Symbolic names for the general-purpose registers used by the code
# generators in this file.
218 $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
219 # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
223 $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
224 $ivp="%r8"; # cbc, ctr, ...
226 $rnds_="%r10d"; # backup copy for $rounds
227 $key_="%r11"; # backup copy for $key
229 # %xmm register layout
# %xmm0-1 are named as round-key registers, %xmm2-9 as the eight data
# ("inout") registers.
230 $rndkey0="%xmm0"; $rndkey1="%xmm1";
231 $inout0="%xmm2"; $inout1="%xmm3";
232 $inout2="%xmm4"; $inout3="%xmm5";
233 $inout4="%xmm6"; $inout5="%xmm7";
234 $inout6="%xmm8"; $inout7="%xmm9";
# The names below map onto %xmm6-9, i.e. the very same registers as
# $inout4..$inout7, so the two sets of names cannot hold live values at
# the same time.
236 $in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
237 $in0="%xmm8"; $iv="%xmm9";
239 # Inline version of internal aesni_[en|de]crypt1.
241 # Why folded loop? Because aes[enc|dec] is slow enough to accommodate
242 # cycles which take care of loop variables...
244 sub aesni_generate1 {
245 my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
248 $movkey ($key),$rndkey0
249 $movkey 16($key),$rndkey1
251 $code.=<<___ if (defined($ivec));
256 $code.=<<___ if (!defined($ivec));
258 xorps $rndkey0,$inout
262 aes${p} $rndkey1,$inout
264 $movkey ($key),$rndkey1
266 jnz .Loop_${p}1_$sn # loop body is 16 bytes
267 aes${p}last $rndkey1,$inout
270 # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
272 { my ($inp,$out,$key) = @_4args;
275 .globl ${PREFIX}_encrypt
276 .type ${PREFIX}_encrypt,\@abi-omnipotent
280 movups ($inp),$inout0 # load input
281 mov 240($key),$rounds # key->rounds
283 &aesni_generate1("enc",$key,$rounds);
285 pxor $rndkey0,$rndkey0 # clear register bank
286 pxor $rndkey1,$rndkey1
287 movups $inout0,($out) # output
291 .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
293 .globl ${PREFIX}_decrypt
294 .type ${PREFIX}_decrypt,\@abi-omnipotent
298 movups ($inp),$inout0 # load input
299 mov 240($key),$rounds # key->rounds
301 &aesni_generate1("dec",$key,$rounds);
303 pxor $rndkey0,$rndkey0 # clear register bank
304 pxor $rndkey1,$rndkey1
305 movups $inout0,($out) # output
309 .size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
313 # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
314 # factor. Why were 3x subroutines originally used in loops? Even though
315 # aes[enc|dec] latency was originally 6, it could be scheduled only
316 # every *2nd* cycle. Thus 3x interleave was the one providing optimal
317 # utilization, i.e. when subroutine's throughput is virtually same as
318 # of non-interleaved subroutine [for number of input blocks up to 3].
319 # This is why it originally made no sense to implement 2x subroutine.
320 # But times change and it became appropriate to spend extra 192 bytes
321 # on 2x subroutine on account of Atom Silvermont. For processors that
322 # can schedule aes[enc|dec] every cycle the optimal interleave factor
323 # equals the corresponding instruction's latency. 8x is optimal for
324 # * Bridge and "super-optimal" for other Intel CPUs...
326 sub aesni_generate2 {
328 # As already mentioned it takes in $key and $rounds, which are *not*
329 # preserved. $inout[0-1] is cipher/clear text...
331 .type _aesni_${dir}rypt2,\@abi-omnipotent
335 $movkey ($key),$rndkey0
337 $movkey 16($key),$rndkey1
338 xorps $rndkey0,$inout0
339 xorps $rndkey0,$inout1
340 $movkey 32($key),$rndkey0
341 lea 32($key,$rounds),$key
346 aes${dir} $rndkey1,$inout0
347 aes${dir} $rndkey1,$inout1
348 $movkey ($key,%rax),$rndkey1
350 aes${dir} $rndkey0,$inout0
351 aes${dir} $rndkey0,$inout1
352 $movkey -16($key,%rax),$rndkey0
355 aes${dir} $rndkey1,$inout0
356 aes${dir} $rndkey1,$inout1
357 aes${dir}last $rndkey0,$inout0
358 aes${dir}last $rndkey0,$inout1
361 .size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
364 sub aesni_generate3 {
366 # As already mentioned it takes in $key and $rounds, which are *not*
367 # preserved. $inout[0-2] is cipher/clear text...
369 .type _aesni_${dir}rypt3,\@abi-omnipotent
373 $movkey ($key),$rndkey0
375 $movkey 16($key),$rndkey1
376 xorps $rndkey0,$inout0
377 xorps $rndkey0,$inout1
378 xorps $rndkey0,$inout2
379 $movkey 32($key),$rndkey0
380 lea 32($key,$rounds),$key
385 aes${dir} $rndkey1,$inout0
386 aes${dir} $rndkey1,$inout1
387 aes${dir} $rndkey1,$inout2
388 $movkey ($key,%rax),$rndkey1
390 aes${dir} $rndkey0,$inout0
391 aes${dir} $rndkey0,$inout1
392 aes${dir} $rndkey0,$inout2
393 $movkey -16($key,%rax),$rndkey0
396 aes${dir} $rndkey1,$inout0
397 aes${dir} $rndkey1,$inout1
398 aes${dir} $rndkey1,$inout2
399 aes${dir}last $rndkey0,$inout0
400 aes${dir}last $rndkey0,$inout1
401 aes${dir}last $rndkey0,$inout2
404 .size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
407 # 4x interleave is implemented to improve small block performance,
408 # most notably [and naturally] 4 blocks by ~30%. One can argue that one
409 # should have implemented 5x as well, but improvement would be <20%,
410 # so it's not worth it...
411 sub aesni_generate4 {
413 # As already mentioned it takes in $key and $rounds, which are *not*
414 # preserved. $inout[0-3] is cipher/clear text...
416 .type _aesni_${dir}rypt4,\@abi-omnipotent
420 $movkey ($key),$rndkey0
422 $movkey 16($key),$rndkey1
423 xorps $rndkey0,$inout0
424 xorps $rndkey0,$inout1
425 xorps $rndkey0,$inout2
426 xorps $rndkey0,$inout3
427 $movkey 32($key),$rndkey0
428 lea 32($key,$rounds),$key
434 aes${dir} $rndkey1,$inout0
435 aes${dir} $rndkey1,$inout1
436 aes${dir} $rndkey1,$inout2
437 aes${dir} $rndkey1,$inout3
438 $movkey ($key,%rax),$rndkey1
440 aes${dir} $rndkey0,$inout0
441 aes${dir} $rndkey0,$inout1
442 aes${dir} $rndkey0,$inout2
443 aes${dir} $rndkey0,$inout3
444 $movkey -16($key,%rax),$rndkey0
447 aes${dir} $rndkey1,$inout0
448 aes${dir} $rndkey1,$inout1
449 aes${dir} $rndkey1,$inout2
450 aes${dir} $rndkey1,$inout3
451 aes${dir}last $rndkey0,$inout0
452 aes${dir}last $rndkey0,$inout1
453 aes${dir}last $rndkey0,$inout2
454 aes${dir}last $rndkey0,$inout3
457 .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
460 sub aesni_generate6 {
462 # As already mentioned it takes in $key and $rounds, which are *not*
463 # preserved. $inout[0-5] is cipher/clear text...
465 .type _aesni_${dir}rypt6,\@abi-omnipotent
469 $movkey ($key),$rndkey0
471 $movkey 16($key),$rndkey1
472 xorps $rndkey0,$inout0
473 pxor $rndkey0,$inout1
474 pxor $rndkey0,$inout2
475 aes${dir} $rndkey1,$inout0
476 lea 32($key,$rounds),$key
478 aes${dir} $rndkey1,$inout1
479 pxor $rndkey0,$inout3
480 pxor $rndkey0,$inout4
481 aes${dir} $rndkey1,$inout2
482 pxor $rndkey0,$inout5
483 $movkey ($key,%rax),$rndkey0
485 jmp .L${dir}_loop6_enter
488 aes${dir} $rndkey1,$inout0
489 aes${dir} $rndkey1,$inout1
490 aes${dir} $rndkey1,$inout2
491 .L${dir}_loop6_enter:
492 aes${dir} $rndkey1,$inout3
493 aes${dir} $rndkey1,$inout4
494 aes${dir} $rndkey1,$inout5
495 $movkey ($key,%rax),$rndkey1
497 aes${dir} $rndkey0,$inout0
498 aes${dir} $rndkey0,$inout1
499 aes${dir} $rndkey0,$inout2
500 aes${dir} $rndkey0,$inout3
501 aes${dir} $rndkey0,$inout4
502 aes${dir} $rndkey0,$inout5
503 $movkey -16($key,%rax),$rndkey0
506 aes${dir} $rndkey1,$inout0
507 aes${dir} $rndkey1,$inout1
508 aes${dir} $rndkey1,$inout2
509 aes${dir} $rndkey1,$inout3
510 aes${dir} $rndkey1,$inout4
511 aes${dir} $rndkey1,$inout5
512 aes${dir}last $rndkey0,$inout0
513 aes${dir}last $rndkey0,$inout1
514 aes${dir}last $rndkey0,$inout2
515 aes${dir}last $rndkey0,$inout3
516 aes${dir}last $rndkey0,$inout4
517 aes${dir}last $rndkey0,$inout5
520 .size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
523 sub aesni_generate8 {
525 # As already mentioned it takes in $key and $rounds, which are *not*
526 # preserved. $inout[0-7] is cipher/clear text...
528 .type _aesni_${dir}rypt8,\@abi-omnipotent
532 $movkey ($key),$rndkey0
534 $movkey 16($key),$rndkey1
535 xorps $rndkey0,$inout0
536 xorps $rndkey0,$inout1
537 pxor $rndkey0,$inout2
538 pxor $rndkey0,$inout3
539 pxor $rndkey0,$inout4
540 lea 32($key,$rounds),$key
542 aes${dir} $rndkey1,$inout0
543 pxor $rndkey0,$inout5
544 pxor $rndkey0,$inout6
545 aes${dir} $rndkey1,$inout1
546 pxor $rndkey0,$inout7
547 $movkey ($key,%rax),$rndkey0
549 jmp .L${dir}_loop8_inner
552 aes${dir} $rndkey1,$inout0
553 aes${dir} $rndkey1,$inout1
554 .L${dir}_loop8_inner:
555 aes${dir} $rndkey1,$inout2
556 aes${dir} $rndkey1,$inout3
557 aes${dir} $rndkey1,$inout4
558 aes${dir} $rndkey1,$inout5
559 aes${dir} $rndkey1,$inout6
560 aes${dir} $rndkey1,$inout7
561 .L${dir}_loop8_enter:
562 $movkey ($key,%rax),$rndkey1
564 aes${dir} $rndkey0,$inout0
565 aes${dir} $rndkey0,$inout1
566 aes${dir} $rndkey0,$inout2
567 aes${dir} $rndkey0,$inout3
568 aes${dir} $rndkey0,$inout4
569 aes${dir} $rndkey0,$inout5
570 aes${dir} $rndkey0,$inout6
571 aes${dir} $rndkey0,$inout7
572 $movkey -16($key,%rax),$rndkey0
575 aes${dir} $rndkey1,$inout0
576 aes${dir} $rndkey1,$inout1
577 aes${dir} $rndkey1,$inout2
578 aes${dir} $rndkey1,$inout3
579 aes${dir} $rndkey1,$inout4
580 aes${dir} $rndkey1,$inout5
581 aes${dir} $rndkey1,$inout6
582 aes${dir} $rndkey1,$inout7
583 aes${dir}last $rndkey0,$inout0
584 aes${dir}last $rndkey0,$inout1
585 aes${dir}last $rndkey0,$inout2
586 aes${dir}last $rndkey0,$inout3
587 aes${dir}last $rndkey0,$inout4
588 aes${dir}last $rndkey0,$inout5
589 aes${dir}last $rndkey0,$inout6
590 aes${dir}last $rndkey0,$inout7
593 .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
# Emit the interleaved helper routines for each supported factor (2, 3, 4,
# 6 and 8). The "dec" variants are generated unconditionally; the "enc"
# variants are generated only when $PREFIX is "aesni".
596 &aesni_generate2("enc") if ($PREFIX eq "aesni");
597 &aesni_generate2("dec");
598 &aesni_generate3("enc") if ($PREFIX eq "aesni");
599 &aesni_generate3("dec");
600 &aesni_generate4("enc") if ($PREFIX eq "aesni");
601 &aesni_generate4("dec");
602 &aesni_generate6("enc") if ($PREFIX eq "aesni");
603 &aesni_generate6("dec");
604 &aesni_generate8("enc") if ($PREFIX eq "aesni");
605 &aesni_generate8("dec");
607 if ($PREFIX eq "aesni") {
608 ########################################################################
609 # void aesni_ecb_encrypt (const void *in, void *out,
610 # size_t length, const AES_KEY *key,
613 .globl aesni_ecb_encrypt
614 .type aesni_ecb_encrypt,\@function,5
619 $code.=<<___ if ($win64);
621 movaps %xmm6,(%rsp) # offload $inout4..7
622 movaps %xmm7,0x10(%rsp)
623 movaps %xmm8,0x20(%rsp)
624 movaps %xmm9,0x30(%rsp)
628 and \$-16,$len # if ($len<16)
629 jz .Lecb_ret # return
631 mov 240($key),$rounds # key->rounds
632 $movkey ($key),$rndkey0
633 mov $key,$key_ # backup $key
634 mov $rounds,$rnds_ # backup $rounds
635 test %r8d,%r8d # 5th argument
637 #--------------------------- ECB ENCRYPT ------------------------------#
638 cmp \$0x80,$len # if ($len<8*16)
639 jb .Lecb_enc_tail # short input
641 movdqu ($inp),$inout0 # load 8 input blocks
642 movdqu 0x10($inp),$inout1
643 movdqu 0x20($inp),$inout2
644 movdqu 0x30($inp),$inout3
645 movdqu 0x40($inp),$inout4
646 movdqu 0x50($inp),$inout5
647 movdqu 0x60($inp),$inout6
648 movdqu 0x70($inp),$inout7
649 lea 0x80($inp),$inp # $inp+=8*16
650 sub \$0x80,$len # $len-=8*16 (can be zero)
651 jmp .Lecb_enc_loop8_enter
654 movups $inout0,($out) # store 8 output blocks
655 mov $key_,$key # restore $key
656 movdqu ($inp),$inout0 # load 8 input blocks
657 mov $rnds_,$rounds # restore $rounds
658 movups $inout1,0x10($out)
659 movdqu 0x10($inp),$inout1
660 movups $inout2,0x20($out)
661 movdqu 0x20($inp),$inout2
662 movups $inout3,0x30($out)
663 movdqu 0x30($inp),$inout3
664 movups $inout4,0x40($out)
665 movdqu 0x40($inp),$inout4
666 movups $inout5,0x50($out)
667 movdqu 0x50($inp),$inout5
668 movups $inout6,0x60($out)
669 movdqu 0x60($inp),$inout6
670 movups $inout7,0x70($out)
671 lea 0x80($out),$out # $out+=8*16
672 movdqu 0x70($inp),$inout7
673 lea 0x80($inp),$inp # $inp+=8*16
674 .Lecb_enc_loop8_enter:
679 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow
681 movups $inout0,($out) # store 8 output blocks
682 mov $key_,$key # restore $key
683 movups $inout1,0x10($out)
684 mov $rnds_,$rounds # restore $rounds
685 movups $inout2,0x20($out)
686 movups $inout3,0x30($out)
687 movups $inout4,0x40($out)
688 movups $inout5,0x50($out)
689 movups $inout6,0x60($out)
690 movups $inout7,0x70($out)
691 lea 0x80($out),$out # $out+=8*16
692 add \$0x80,$len # restore real remaining $len
693 jz .Lecb_ret # done if ($len==0)
695 .Lecb_enc_tail: # $len is less than 8*16
696 movups ($inp),$inout0
699 movups 0x10($inp),$inout1
701 movups 0x20($inp),$inout2
704 movups 0x30($inp),$inout3
706 movups 0x40($inp),$inout4
709 movups 0x50($inp),$inout5
711 movdqu 0x60($inp),$inout6
712 xorps $inout7,$inout7
714 movups $inout0,($out) # store 7 output blocks
715 movups $inout1,0x10($out)
716 movups $inout2,0x20($out)
717 movups $inout3,0x30($out)
718 movups $inout4,0x40($out)
719 movups $inout5,0x50($out)
720 movups $inout6,0x60($out)
725 &aesni_generate1("enc",$key,$rounds);
727 movups $inout0,($out) # store one output block
732 movups $inout0,($out) # store 2 output blocks
733 movups $inout1,0x10($out)
738 movups $inout0,($out) # store 3 output blocks
739 movups $inout1,0x10($out)
740 movups $inout2,0x20($out)
745 movups $inout0,($out) # store 4 output blocks
746 movups $inout1,0x10($out)
747 movups $inout2,0x20($out)
748 movups $inout3,0x30($out)
752 xorps $inout5,$inout5
754 movups $inout0,($out) # store 5 output blocks
755 movups $inout1,0x10($out)
756 movups $inout2,0x20($out)
757 movups $inout3,0x30($out)
758 movups $inout4,0x40($out)
763 movups $inout0,($out) # store 6 output blocks
764 movups $inout1,0x10($out)
765 movups $inout2,0x20($out)
766 movups $inout3,0x30($out)
767 movups $inout4,0x40($out)
768 movups $inout5,0x50($out)
770 \f#--------------------------- ECB DECRYPT ------------------------------#
773 cmp \$0x80,$len # if ($len<8*16)
774 jb .Lecb_dec_tail # short input
776 movdqu ($inp),$inout0 # load 8 input blocks
777 movdqu 0x10($inp),$inout1
778 movdqu 0x20($inp),$inout2
779 movdqu 0x30($inp),$inout3
780 movdqu 0x40($inp),$inout4
781 movdqu 0x50($inp),$inout5
782 movdqu 0x60($inp),$inout6
783 movdqu 0x70($inp),$inout7
784 lea 0x80($inp),$inp # $inp+=8*16
785 sub \$0x80,$len # $len-=8*16 (can be zero)
786 jmp .Lecb_dec_loop8_enter
789 movups $inout0,($out) # store 8 output blocks
790 mov $key_,$key # restore $key
791 movdqu ($inp),$inout0 # load 8 input blocks
792 mov $rnds_,$rounds # restore $rounds
793 movups $inout1,0x10($out)
794 movdqu 0x10($inp),$inout1
795 movups $inout2,0x20($out)
796 movdqu 0x20($inp),$inout2
797 movups $inout3,0x30($out)
798 movdqu 0x30($inp),$inout3
799 movups $inout4,0x40($out)
800 movdqu 0x40($inp),$inout4
801 movups $inout5,0x50($out)
802 movdqu 0x50($inp),$inout5
803 movups $inout6,0x60($out)
804 movdqu 0x60($inp),$inout6
805 movups $inout7,0x70($out)
806 lea 0x80($out),$out # $out+=8*16
807 movdqu 0x70($inp),$inout7
808 lea 0x80($inp),$inp # $inp+=8*16
809 .Lecb_dec_loop8_enter:
813 $movkey ($key_),$rndkey0
815 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow
817 movups $inout0,($out) # store 8 output blocks
818 pxor $inout0,$inout0 # clear register bank
819 mov $key_,$key # restore $key
820 movups $inout1,0x10($out)
822 mov $rnds_,$rounds # restore $rounds
823 movups $inout2,0x20($out)
825 movups $inout3,0x30($out)
827 movups $inout4,0x40($out)
829 movups $inout5,0x50($out)
831 movups $inout6,0x60($out)
833 movups $inout7,0x70($out)
835 lea 0x80($out),$out # $out+=8*16
836 add \$0x80,$len # restore real remaining $len
837 jz .Lecb_ret # done if ($len==0)
840 movups ($inp),$inout0
843 movups 0x10($inp),$inout1
845 movups 0x20($inp),$inout2
848 movups 0x30($inp),$inout3
850 movups 0x40($inp),$inout4
853 movups 0x50($inp),$inout5
855 movups 0x60($inp),$inout6
856 $movkey ($key),$rndkey0
857 xorps $inout7,$inout7
859 movups $inout0,($out) # store 7 output blocks
860 pxor $inout0,$inout0 # clear register bank
861 movups $inout1,0x10($out)
863 movups $inout2,0x20($out)
865 movups $inout3,0x30($out)
867 movups $inout4,0x40($out)
869 movups $inout5,0x50($out)
871 movups $inout6,0x60($out)
878 &aesni_generate1("dec",$key,$rounds);
880 movups $inout0,($out) # store one output block
881 pxor $inout0,$inout0 # clear register bank
886 movups $inout0,($out) # store 2 output blocks
887 pxor $inout0,$inout0 # clear register bank
888 movups $inout1,0x10($out)
894 movups $inout0,($out) # store 3 output blocks
895 pxor $inout0,$inout0 # clear register bank
896 movups $inout1,0x10($out)
898 movups $inout2,0x20($out)
904 movups $inout0,($out) # store 4 output blocks
905 pxor $inout0,$inout0 # clear register bank
906 movups $inout1,0x10($out)
908 movups $inout2,0x20($out)
910 movups $inout3,0x30($out)
915 xorps $inout5,$inout5
917 movups $inout0,($out) # store 5 output blocks
918 pxor $inout0,$inout0 # clear register bank
919 movups $inout1,0x10($out)
921 movups $inout2,0x20($out)
923 movups $inout3,0x30($out)
925 movups $inout4,0x40($out)
932 movups $inout0,($out) # store 6 output blocks
933 pxor $inout0,$inout0 # clear register bank
934 movups $inout1,0x10($out)
936 movups $inout2,0x20($out)
938 movups $inout3,0x30($out)
940 movups $inout4,0x40($out)
942 movups $inout5,0x50($out)
946 xorps $rndkey0,$rndkey0 # %xmm0
947 pxor $rndkey1,$rndkey1
949 $code.=<<___ if ($win64);
951 movaps %xmm0,(%rsp) # clear stack
952 movaps 0x10(%rsp),%xmm7
953 movaps %xmm0,0x10(%rsp)
954 movaps 0x20(%rsp),%xmm8
955 movaps %xmm0,0x20(%rsp)
956 movaps 0x30(%rsp),%xmm9
957 movaps %xmm0,0x30(%rsp)
964 .size aesni_ecb_encrypt,.-aesni_ecb_encrypt
968 ######################################################################
969 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
970 # size_t blocks, const AES_KEY *key,
971 # const char *ivec,char *cmac);
973 # Handles only complete blocks, operates on 64-bit counter and
974 # does not update *ivec! Nor does it finalize CMAC value
975 # (see engine/eng_aesni.c for details)
978 my $cmac="%r9"; # 6th argument
980 my $increment="%xmm9";
982 my $bswap_mask="%xmm7";
985 .globl aesni_ccm64_encrypt_blocks
986 .type aesni_ccm64_encrypt_blocks,\@function,6
988 aesni_ccm64_encrypt_blocks:
991 $code.=<<___ if ($win64);
993 movaps %xmm6,(%rsp) # $iv
994 movaps %xmm7,0x10(%rsp) # $bswap_mask
995 movaps %xmm8,0x20(%rsp) # $in0
996 movaps %xmm9,0x30(%rsp) # $increment
1000 mov 240($key),$rounds # key->rounds
1002 movdqa .Lincrement64(%rip),$increment
1003 movdqa .Lbswap_mask(%rip),$bswap_mask
1008 movdqu ($cmac),$inout1
1010 lea 32($key,$rounds),$key # end of key schedule
1011 pshufb $bswap_mask,$iv
1012 sub %rax,%r10 # twisted $rounds
1013 jmp .Lccm64_enc_outer
1016 $movkey ($key_),$rndkey0
1018 movups ($inp),$in0 # load inp
1020 xorps $rndkey0,$inout0 # counter
1021 $movkey 16($key_),$rndkey1
1023 xorps $rndkey0,$inout1 # cmac^=inp
1024 $movkey 32($key_),$rndkey0
1027 aesenc $rndkey1,$inout0
1028 aesenc $rndkey1,$inout1
1029 $movkey ($key,%rax),$rndkey1
1031 aesenc $rndkey0,$inout0
1032 aesenc $rndkey0,$inout1
1033 $movkey -16($key,%rax),$rndkey0
1034 jnz .Lccm64_enc2_loop
1035 aesenc $rndkey1,$inout0
1036 aesenc $rndkey1,$inout1
1037 paddq $increment,$iv
1038 dec $len # $len-- ($len is in blocks)
1039 aesenclast $rndkey0,$inout0
1040 aesenclast $rndkey0,$inout1
1043 xorps $inout0,$in0 # inp ^= E(iv)
1045 movups $in0,($out) # save output
1046 pshufb $bswap_mask,$inout0
1047 lea 16($out),$out # $out+=16
1048 jnz .Lccm64_enc_outer # loop if ($len!=0)
1050 pxor $rndkey0,$rndkey0 # clear register bank
1051 pxor $rndkey1,$rndkey1
1052 pxor $inout0,$inout0
1053 movups $inout1,($cmac) # store resulting mac
1054 pxor $inout1,$inout1
1058 $code.=<<___ if ($win64);
1060 movaps %xmm0,(%rsp) # clear stack
1061 movaps 0x10(%rsp),%xmm7
1062 movaps %xmm0,0x10(%rsp)
1063 movaps 0x20(%rsp),%xmm8
1064 movaps %xmm0,0x20(%rsp)
1065 movaps 0x30(%rsp),%xmm9
1066 movaps %xmm0,0x30(%rsp)
1073 .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
1075 ######################################################################
# CCM decryption routine.  The .type directive below declares it to the
# perlasm translator as a function taking six arguments; according to the
# loop comments further down, $len is a count of 16-byte blocks.
1077 .globl aesni_ccm64_decrypt_blocks
1078 .type aesni_ccm64_decrypt_blocks,\@function,6
1080 aesni_ccm64_decrypt_blocks:
# Win64 builds only: reserve 0x58 bytes of stack and spill %xmm6-%xmm9
# into it.  NOTE(review): presumably because the Win64 ABI treats these
# registers as callee-saved -- confirm.
1083 $code.=<<___ if ($win64);
1084 lea -0x58(%rsp),%rsp
1085 movaps %xmm6,(%rsp) # $iv
1086 movaps %xmm7,0x10(%rsp) # $bswap_mask
1087 movaps %xmm8,0x20(%rsp) # $in8
1088 movaps %xmm9,0x30(%rsp) # $increment
# Common setup: fetch the round count, the running CMAC value (kept in
# $inout1 for the whole routine) and the two constants used on the counter.
1092 mov 240($key),$rounds # key->rounds
1094 movdqu ($cmac),$inout1
1095 movdqa .Lincrement64(%rip),$increment
1096 movdqa .Lbswap_mask(%rip),$bswap_mask
1101 pshufb $bswap_mask,$iv
# aesni_generate1 is a code generator defined elsewhere in this file; the
# "inp ^= E(iv)" remark below indicates that the encrypted counter ends up
# in $inout0 afterwards.
1103 &aesni_generate1("enc",$key,$rounds);
# Fetch the first input block, add the increment to the counter and set up
# the key-schedule addressing used by the round loop.
# NOTE(review): "twisted" presumably means the round count turned into a
# negative byte offset from the end of the key schedule -- confirm.
1107 movups ($inp),$in0 # load inp
1108 paddq $increment,$iv
1109 lea 16($inp),$inp # $inp+=16
1110 sub %r10,%rax # twisted $rounds
1111 lea 32($key_,$rnds_),$key # end of key schedule
1113 jmp .Lccm64_dec_outer
# Per-block step: xor the input block with the encrypted counter to obtain
# the output block, store it, and stop once the block count hits zero.
1116 xorps $inout0,$in0 # inp ^= E(iv)
1118 movups $in0,($out) # save output
1119 lea 16($out),$out # $out+=16
1120 pshufb $bswap_mask,$inout0
1122 sub \$1,$len # $len-- ($len is in blocks)
1123 jz .Lccm64_dec_break # if ($len==0) break
# More blocks follow: reload the first three round keys from the start of
# the key schedule ($key_) and prime both AES streams.  $inout0 is the
# counter stream; $inout1 is the CMAC stream, which first absorbs the
# block that was just produced.
1125 $movkey ($key_),$rndkey0
1127 $movkey 16($key_),$rndkey1
1129 xorps $rndkey0,$inout0
1130 xorps $in0,$inout1 # cmac^=out
1131 $movkey 32($key_),$rndkey0
1132 jmp .Lccm64_dec2_loop
# Round loop: every round key is applied to both streams back to back, so
# the counter encryption and the CMAC encryption advance in lock-step.
1135 aesenc $rndkey1,$inout0
1136 aesenc $rndkey1,$inout1
1137 $movkey ($key,%rax),$rndkey1
1139 aesenc $rndkey0,$inout0
1140 aesenc $rndkey0,$inout1
1141 $movkey -16($key,%rax),$rndkey0
1142 jnz .Lccm64_dec2_loop
# Loop tail: the next input block is fetched and the counter advanced
# while the last two rounds of both streams complete.
1143 movups ($inp),$in0 # load input
1144 paddq $increment,$iv
1145 aesenc $rndkey1,$inout0
1146 aesenc $rndkey1,$inout1
1147 aesenclast $rndkey0,$inout0
1148 aesenclast $rndkey0,$inout1
1149 lea 16($inp),$inp # $inp+=16
1150 jmp .Lccm64_dec_outer
# Exit path, reached when the block count hits zero: one more generated
# encryption finishes the MAC, which is written back through $cmac.
# NOTE(review): the two extra arguments presumably select $inout1 as the
# block to encrypt and $in0 as an extra xor operand -- confirm against the
# definition of aesni_generate1.
1154 #xorps $in0,$inout1 # cmac^=out
1155 mov 240($key_),$rounds
1157 &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
# Wipe round keys and data from the registers; the MAC is stored before
# its register is cleared.
1159 pxor $rndkey0,$rndkey0 # clear register bank
1160 pxor $rndkey1,$rndkey1
1161 pxor $inout0,$inout0
1162 movups $inout1,($cmac) # store resulting mac
1163 pxor $inout1,$inout1
# Win64 builds only: reload the spilled registers and overwrite each save
# slot with %xmm0 right after it has been read back.
1167 $code.=<<___ if ($win64);
1169 movaps %xmm0,(%rsp) # clear stack
1170 movaps 0x10(%rsp),%xmm7
1171 movaps %xmm0,0x10(%rsp)
1172 movaps 0x20(%rsp),%xmm8
1173 movaps %xmm0,0x20(%rsp)
1174 movaps 0x30(%rsp),%xmm9
1175 movaps %xmm0,0x30(%rsp)
1182 .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
1185 ######################################################################
1186 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1187 # size_t blocks, const AES_KEY *key,
1188 # const char *ivec);
1190 # Handles only complete blocks, operates on a 32-bit counter and
1191 # does not update *ivec! (see crypto/modes/ctr128.c for details)
1193 # Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
1194 # http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
1195 # Keywords are full unroll and modulo-schedule counter calculations
1196 # with zero-round key xor.
# Register aliases local to this routine: six input staging registers
# mapped onto %xmm10..%xmm15.
1198 my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
# $key0 receives the dword at offset 12 of the zero-round key and $ctr the
# dword at offset 12 of the IV (see the loads in the bulk path); $ctr is
# the 32-bit name of the $ivp register.
1199 my ($key0,$ctr)=("%ebp","${ivp}d");
# 0x80 bytes hold the eight 16-byte counter slots; Win64 adds 160 bytes
# for the ten XMM registers saved in the bulk path.
1200 my $frame_size = 0x80 + ($win64?160:0);
1203 .globl aesni_ctr32_encrypt_blocks
1204 .type aesni_ctr32_encrypt_blocks,\@function,5
1206 aesni_ctr32_encrypt_blocks:
1211 # handle single block without allocating stack frame,
1212 # useful when handling edges
# The counter block at $ivp is encrypted by the generated code, xor-ed
# with the single input block and stored; round keys and data registers
# are cleared before jumping to the epilogue.
1213 movups ($ivp),$inout0
1214 movups ($inp),$inout1
1215 mov 240($key),%edx # key->rounds
1217 &aesni_generate1("enc",$key,"%edx");
1219 pxor $rndkey0,$rndkey0 # clear register bank
1220 pxor $rndkey1,$rndkey1
1221 xorps $inout1,$inout0
1222 pxor $inout1,$inout1
1223 movups $inout0,($out)
1224 xorps $inout0,$inout0
1225 jmp .Lctr32_epilogue
# Bulk path frame setup: remember the incoming stack pointer in $key_,
# reserve the frame and round %rsp down to a 16-byte boundary so that the
# aligned movdqa/movaps accesses to the counter slots are valid.
1229 lea (%rsp),$key_ # use $key_ as frame pointer
1230 .cfi_def_cfa_register $key_
1233 sub \$$frame_size,%rsp
1234 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
# Win64 builds only: save %xmm6-%xmm15 in ten 16-byte slots addressed at
# negative offsets (-0xa8 up to -0x18) from the frame pointer.
1236 $code.=<<___ if ($win64);
1237 movaps %xmm6,-0xa8($key_) # offload everything
1238 movaps %xmm7,-0x98($key_)
1239 movaps %xmm8,-0x88($key_)
1240 movaps %xmm9,-0x78($key_)
1241 movaps %xmm10,-0x68($key_)
1242 movaps %xmm11,-0x58($key_)
1243 movaps %xmm12,-0x48($key_)
1244 movaps %xmm13,-0x38($key_)
1245 movaps %xmm14,-0x28($key_)
1246 movaps %xmm15,-0x18($key_)
1251 # 8 16-byte words on top of stack are counter values
1252 # xor-ed with zero-round key
# Every slot is first seeded with (IV xor zero-round key); afterwards only
# the dword at byte offset 12 of each slot (lane 3 for pinsrd) is patched
# with that block's counter word.
1254 movdqu ($ivp),$inout0
1255 movdqu ($key),$rndkey0
1256 mov 12($ivp),$ctr # counter LSB
1257 pxor $rndkey0,$inout0
1258 mov 12($key),$key0 # 0-round key LSB
1259 movdqa $inout0,0x00(%rsp) # populate counter block
1261 movdqa $inout0,$inout1
1262 movdqa $inout0,$inout2
1263 movdqa $inout0,$inout3
1264 movdqa $inout0,0x40(%rsp)
1265 movdqa $inout0,0x50(%rsp)
1266 movdqa $inout0,0x60(%rsp)
1267 mov %rdx,%r10 # about to borrow %rdx
1268 movdqa $inout0,0x70(%rsp)
# Slots 1..3 are built in registers with pinsrd and then stored; slots
# 4..7 are patched directly in memory at offset +12.
1276 pinsrd \$3,%eax,$inout1
1278 movdqa $inout1,0x10(%rsp)
1279 pinsrd \$3,%edx,$inout2
1281 mov %r10,%rdx # restore %rdx
1283 movdqa $inout2,0x20(%rsp)
1286 pinsrd \$3,%eax,$inout3
1288 movdqa $inout3,0x30(%rsp)
1290 mov %r10d,0x40+12(%rsp)
1293 mov 240($key),$rounds # key->rounds
1296 mov %r9d,0x50+12(%rsp)
1299 mov %r10d,0x60+12(%rsp)
# CPU feature probe: read the second 32-bit capability word and keep only
# the two bits that decide between the 6-block and 8-block loops.
1301 mov OPENSSL_ia32cap_P+4(%rip),%r10d
1303 and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
1304 mov %r9d,0x70+12(%rsp)
# Preload the round key stored 16 bytes into the schedule and pull slots 4
# and 5 into registers, so that $inout0..5 all hold counter blocks.
1306 $movkey 0x10($key),$rndkey1
1308 movdqa 0x40(%rsp),$inout4
1309 movdqa 0x50(%rsp),$inout5
# Dispatch: fewer than 8 blocks go to the tail code; MOVBE without XSAVE
# selects the 6-block loop; otherwise $key is biased by +0x80 because the
# 8-block loop addresses round keys as N-0x80 relative to $key.
1311 cmp \$8,$len # $len is in blocks
1312 jb .Lctr32_tail # short input if ($len<8)
1314 sub \$6,$len # $len is biased by -6
1315 cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
1316 je .Lctr32_6x # [which denotes Atom Silvermont]
1318 lea 0x80($key),$key # size optimization
1319 sub \$2,$len # $len is biased by -8
# 6-block path (NOTE(review): presumably the target of the MOVBE-without-
# XSAVE branch taken in the dispatch code -- confirm).  $key is moved to
# the end of the key schedule so that round keys can be addressed with the
# negative offset kept in $rnds_.
1327 lea 32($key,$rounds),$key # end of key schedule
1328 sub %rax,%r10 # twisted $rounds
# Two AES rounds over six counter blocks, interleaved with movbe stores
# that write byte-swapped counter words for the following iteration into
# offset +12 of stack slots 0..5.
1333 add \$6,$ctr # next counter value
1334 $movkey -48($key,$rnds_),$rndkey0
1335 aesenc $rndkey1,$inout0
1338 aesenc $rndkey1,$inout1
1339 movbe %eax,`0x00+12`(%rsp) # store next counter value
1341 aesenc $rndkey1,$inout2
1343 movbe %eax,`0x10+12`(%rsp)
1344 aesenc $rndkey1,$inout3
1347 aesenc $rndkey1,$inout4
1348 movbe %eax,`0x20+12`(%rsp)
1350 aesenc $rndkey1,$inout5
1351 $movkey -32($key,$rnds_),$rndkey1
1354 aesenc $rndkey0,$inout0
1355 movbe %eax,`0x30+12`(%rsp)
1357 aesenc $rndkey0,$inout1
1359 movbe %eax,`0x40+12`(%rsp)
1360 aesenc $rndkey0,$inout2
1363 aesenc $rndkey0,$inout3
1364 movbe %eax,`0x50+12`(%rsp)
1365 mov %r10,%rax # mov $rnds_,$rounds
1366 aesenc $rndkey0,$inout4
1367 aesenc $rndkey0,$inout5
1368 $movkey -16($key,$rnds_),$rndkey0
# Combine the key stream with six input blocks.  While doing so,
# $inout0..5 are reloaded from the stack slots with the counter blocks of
# the next iteration and $rndkey1 is reloaded for its first round.
1372 movdqu ($inp),$inout6 # load 6 input blocks
1373 movdqu 0x10($inp),$inout7
1374 movdqu 0x20($inp),$in0
1375 movdqu 0x30($inp),$in1
1376 movdqu 0x40($inp),$in2
1377 movdqu 0x50($inp),$in3
1378 lea 0x60($inp),$inp # $inp+=6*16
1379 $movkey -64($key,$rnds_),$rndkey1
1380 pxor $inout0,$inout6 # inp^=E(ctr)
1381 movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
1382 pxor $inout1,$inout7
1383 movaps 0x10(%rsp),$inout1
1385 movaps 0x20(%rsp),$inout2
1387 movaps 0x30(%rsp),$inout3
1389 movaps 0x40(%rsp),$inout4
1391 movaps 0x50(%rsp),$inout5
1392 movdqu $inout6,($out) # store 6 output blocks
1393 movdqu $inout7,0x10($out)
1394 movdqu $in0,0x20($out)
1395 movdqu $in1,0x30($out)
1396 movdqu $in2,0x40($out)
1397 movdqu $in3,0x50($out)
1398 lea 0x60($out),$out # $out+=6*16
# Loop control and exit: once the biased length underflows, undo the bias
# and, if blocks remain, convert $key and $rounds back from their
# "twisted" form.
1401 jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
1403 add \$6,$len # restore real remaining $len
1404 jz .Lctr32_done # done if ($len==0)
1406 lea -48($rnds_),$rounds
1407 lea -80($key,$rnds_),$key # restore $key
1409 shr \$4,$rounds # restore $rounds
# 8-block path.  Round keys are addressed as N-0x80 relative to $key,
# which was biased by +0x80 in the dispatch code.  While the rounds run,
# the counter words of the following iteration are written to offset +12
# of the eight stack slots.
1414 add \$8,$ctr # next counter value
1415 movdqa 0x60(%rsp),$inout6
1416 aesenc $rndkey1,$inout0
1418 movdqa 0x70(%rsp),$inout7
1419 aesenc $rndkey1,$inout1
1421 $movkey 0x20-0x80($key),$rndkey0
1422 aesenc $rndkey1,$inout2
1425 aesenc $rndkey1,$inout3
1426 mov %r9d,0x00+12(%rsp) # store next counter value
1428 aesenc $rndkey1,$inout4
1429 aesenc $rndkey1,$inout5
1430 aesenc $rndkey1,$inout6
1431 aesenc $rndkey1,$inout7
1432 $movkey 0x30-0x80($key),$rndkey1
# Generation-time unrolling: six more rounds are emitted, alternating
# between the two round-key registers (even pass uses rndkey0, odd pass
# rndkey1).  Each pass patches the counter word of the next stack slot
# (slots 1..6) and reloads the register it just used with the key two
# rounds ahead.
1434 for($i=2;$i<8;$i++) {
1435 my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
1438 aesenc $rndkeyx,$inout0
1439 aesenc $rndkeyx,$inout1
1442 aesenc $rndkeyx,$inout2
1443 aesenc $rndkeyx,$inout3
1444 mov %r9d,`0x10*($i-1)`+12(%rsp)
1446 aesenc $rndkeyx,$inout4
1447 aesenc $rndkeyx,$inout5
1448 aesenc $rndkeyx,$inout6
1449 aesenc $rndkeyx,$inout7
1450 $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
# Next round; the last counter slot (7) is patched and the first input
# block starts loading.
1455 aesenc $rndkey0,$inout0
1456 aesenc $rndkey0,$inout1
1457 aesenc $rndkey0,$inout2
1459 movdqu 0x00($inp),$in0 # start loading input
1460 aesenc $rndkey0,$inout3
1461 mov %r9d,0x70+12(%rsp)
1463 aesenc $rndkey0,$inout4
1464 aesenc $rndkey0,$inout5
1465 aesenc $rndkey0,$inout6
1466 aesenc $rndkey0,$inout7
1467 $movkey 0xa0-0x80($key),$rndkey0
# Further rounds using the keys at offsets 0xb0..0xe0.
# NOTE(review): these look like the additional rounds needed for longer
# keys; the conditions that select how many of them execute should be
# confirmed.
1471 aesenc $rndkey1,$inout0
1472 aesenc $rndkey1,$inout1
1473 aesenc $rndkey1,$inout2
1474 aesenc $rndkey1,$inout3
1475 aesenc $rndkey1,$inout4
1476 aesenc $rndkey1,$inout5
1477 aesenc $rndkey1,$inout6
1478 aesenc $rndkey1,$inout7
1479 $movkey 0xb0-0x80($key),$rndkey1
1481 aesenc $rndkey0,$inout0
1482 aesenc $rndkey0,$inout1
1483 aesenc $rndkey0,$inout2
1484 aesenc $rndkey0,$inout3
1485 aesenc $rndkey0,$inout4
1486 aesenc $rndkey0,$inout5
1487 aesenc $rndkey0,$inout6
1488 aesenc $rndkey0,$inout7
1489 $movkey 0xc0-0x80($key),$rndkey0
1492 aesenc $rndkey1,$inout0
1493 aesenc $rndkey1,$inout1
1494 aesenc $rndkey1,$inout2
1495 aesenc $rndkey1,$inout3
1496 aesenc $rndkey1,$inout4
1497 aesenc $rndkey1,$inout5
1498 aesenc $rndkey1,$inout6
1499 aesenc $rndkey1,$inout7
1500 $movkey 0xd0-0x80($key),$rndkey1
1502 aesenc $rndkey0,$inout0
1503 aesenc $rndkey0,$inout1
1504 aesenc $rndkey0,$inout2
1505 aesenc $rndkey0,$inout3
1506 aesenc $rndkey0,$inout4
1507 aesenc $rndkey0,$inout5
1508 aesenc $rndkey0,$inout6
1509 aesenc $rndkey0,$inout7
1510 $movkey 0xe0-0x80($key),$rndkey0
1511 jmp .Lctr32_enc_done
# Finish: at this point $rndkey0 holds the last round key.  Input blocks
# are xor-ed with it ahead of time so that a single aesenclast performs
# both the final AES round and the xor with the input.
1515 movdqu 0x10($inp),$in1
1516 pxor $rndkey0,$in0 # input^=round[last]
1517 movdqu 0x20($inp),$in2
1519 movdqu 0x30($inp),$in3
1521 movdqu 0x40($inp),$in4
1523 movdqu 0x50($inp),$in5
1526 aesenc $rndkey1,$inout0
1527 aesenc $rndkey1,$inout1
1528 aesenc $rndkey1,$inout2
1529 aesenc $rndkey1,$inout3
1530 aesenc $rndkey1,$inout4
1531 aesenc $rndkey1,$inout5
1532 aesenc $rndkey1,$inout6
1533 aesenc $rndkey1,$inout7
1534 movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
1535 lea 0x80($inp),$inp # $inp+=8*16
# The registers freed by each aesenclast are immediately reused to fetch
# the counter blocks of the next iteration from the stack slots.
1537 aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
1538 pxor $rndkey0,$rndkey1 # borrowed $rndkey
1539 movdqu 0x70-0x80($inp),$in0
1540 aesenclast $in1,$inout1
1542 movdqa 0x00(%rsp),$in1 # load next counter block
1543 aesenclast $in2,$inout2
1544 aesenclast $in3,$inout3
1545 movdqa 0x10(%rsp),$in2
1546 movdqa 0x20(%rsp),$in3
1547 aesenclast $in4,$inout4
1548 aesenclast $in5,$inout5
1549 movdqa 0x30(%rsp),$in4
1550 movdqa 0x40(%rsp),$in5
1551 aesenclast $rndkey1,$inout6
1552 movdqa 0x50(%rsp),$rndkey0
1553 $movkey 0x10-0x80($key),$rndkey1#real 1st-round key
1554 aesenclast $in0,$inout7
# Store the eight result blocks; the counter block that was parked in
# $rndkey0 is moved into $inout5 on the way.
1556 movups $inout0,($out) # store 8 output blocks
1558 movups $inout1,0x10($out)
1560 movups $inout2,0x20($out)
1562 movups $inout3,0x30($out)
1564 movups $inout4,0x40($out)
1566 movups $inout5,0x50($out)
1567 movdqa $rndkey0,$inout5
1568 movups $inout6,0x60($out)
1569 movups $inout7,0x70($out)
1570 lea 0x80($out),$out # $out+=8*16
# Loop control: on underflow of the biased length, undo the bias and the
# +0x80 offset applied to $key before the leftover blocks are handled.
1573 jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
1575 add \$8,$len # restore real remaining $len
1576 jz .Lctr32_done # done if ($len==0)
1577 lea -0x80($key),$key
# Tail: handles the blocks left over after the bulk loops (or a short
# input), in three variants selected by the remaining block count.
1580 # note that at this point $inout0..5 are populated with
1581 # counter values xor-ed with 0-round key
1587 # if ($len>4) compute 7 E(counter)
# Variant for 5..7 blocks: a seventh counter block is fetched from its
# stack slot, the eighth lane is zeroed as a dummy, and after the first
# round the shared 8-way round loop is entered part-way through.
1589 movdqa 0x60(%rsp),$inout6
1590 pxor $inout7,$inout7
1592 $movkey 16($key),$rndkey0
1593 aesenc $rndkey1,$inout0
1594 aesenc $rndkey1,$inout1
1595 lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
1597 aesenc $rndkey1,$inout2
1598 add \$16,%rax # prepare for .Lenc_loop8_enter
1600 aesenc $rndkey1,$inout3
1601 aesenc $rndkey1,$inout4
1602 movups 0x10($inp),$in1 # pre-load input
1603 movups 0x20($inp),$in2
1604 aesenc $rndkey1,$inout5
1605 aesenc $rndkey1,$inout6
1607 call .Lenc_loop8_enter
# Five blocks are always written; blocks six and seven are loaded and
# written only when the remaining count calls for them, so nothing beyond
# the requested length is touched.
1609 movdqu 0x30($inp),$in3
1611 movdqu 0x40($inp),$in0
1613 movdqu $inout0,($out) # store output
1615 movdqu $inout1,0x10($out)
1617 movdqu $inout2,0x20($out)
1619 movdqu $inout3,0x30($out)
1620 movdqu $inout4,0x40($out)
1622 jb .Lctr32_done # $len was 5, stop store
1624 movups 0x50($inp),$in1
1626 movups $inout5,0x50($out)
1627 je .Lctr32_done # $len was 6, stop store
1629 movups 0x60($inp),$in2
1631 movups $inout6,0x60($out)
1632 jmp .Lctr32_done # $len was 7, stop store
# Variant for exactly four blocks: four lanes are carried through the
# rounds and finished with aesenclast using the key loaded from the
# current $key position.
1636 aesenc $rndkey1,$inout0
1639 aesenc $rndkey1,$inout1
1640 aesenc $rndkey1,$inout2
1641 aesenc $rndkey1,$inout3
1642 $movkey ($key),$rndkey1
1644 aesenclast $rndkey1,$inout0
1645 aesenclast $rndkey1,$inout1
1646 movups ($inp),$in0 # load input
1647 movups 0x10($inp),$in1
1648 aesenclast $rndkey1,$inout2
1649 aesenclast $rndkey1,$inout3
1650 movups 0x20($inp),$in2
1651 movups 0x30($inp),$in3
1654 movups $inout0,($out) # store output
1656 movups $inout1,0x10($out)
1658 movdqu $inout2,0x20($out)
1660 movdqu $inout3,0x30($out)
1661 jmp .Lctr32_done # $len was 4, stop store
# Variant for one to three blocks: three lanes are always computed, but
# input is read and output written only for the blocks that exist.
1665 aesenc $rndkey1,$inout0
1668 aesenc $rndkey1,$inout1
1669 aesenc $rndkey1,$inout2
1670 $movkey ($key),$rndkey1
1672 aesenclast $rndkey1,$inout0
1673 aesenclast $rndkey1,$inout1
1674 aesenclast $rndkey1,$inout2
1676 movups ($inp),$in0 # load input
1678 movups $inout0,($out) # store output
1680 jb .Lctr32_done # $len was 1, stop store
1682 movups 0x10($inp),$in1
1684 movups $inout1,0x10($out)
1685 je .Lctr32_done # $len was 2, stop store
1687 movups 0x20($inp),$in2
1689 movups $inout2,0x20($out) # $len was 3, stop store
# Common exit of the bulk path: %xmm0 is zeroed and then used as the
# source for wiping everything this routine left on the stack.
1692 xorps %xmm0,%xmm0 # clear register bank
# Non-Win64 builds: overwrite the eight 16-byte counter slots, which held
# counter values xor-ed with the zero-round key.
1700 $code.=<<___ if (!$win64);
1703 movaps %xmm0,0x00(%rsp) # clear stack
1705 movaps %xmm0,0x10(%rsp)
1707 movaps %xmm0,0x20(%rsp)
1709 movaps %xmm0,0x30(%rsp)
1711 movaps %xmm0,0x40(%rsp)
1713 movaps %xmm0,0x50(%rsp)
1715 movaps %xmm0,0x60(%rsp)
1717 movaps %xmm0,0x70(%rsp)
# Win64 builds: restore %xmm6-%xmm15 from the save area below the frame
# pointer, overwriting each save slot right after it is read, then wipe
# the eight counter slots as well.
1720 $code.=<<___ if ($win64);
1721 movaps -0xa8($key_),%xmm6
1722 movaps %xmm0,-0xa8($key_) # clear stack
1723 movaps -0x98($key_),%xmm7
1724 movaps %xmm0,-0x98($key_)
1725 movaps -0x88($key_),%xmm8
1726 movaps %xmm0,-0x88($key_)
1727 movaps -0x78($key_),%xmm9
1728 movaps %xmm0,-0x78($key_)
1729 movaps -0x68($key_),%xmm10
1730 movaps %xmm0,-0x68($key_)
1731 movaps -0x58($key_),%xmm11
1732 movaps %xmm0,-0x58($key_)
1733 movaps -0x48($key_),%xmm12
1734 movaps %xmm0,-0x48($key_)
1735 movaps -0x38($key_),%xmm13
1736 movaps %xmm0,-0x38($key_)
1737 movaps -0x28($key_),%xmm14
1738 movaps %xmm0,-0x28($key_)
1739 movaps -0x18($key_),%xmm15
1740 movaps %xmm0,-0x18($key_)
1741 movaps %xmm0,0x00(%rsp)
1742 movaps %xmm0,0x10(%rsp)
1743 movaps %xmm0,0x20(%rsp)
1744 movaps %xmm0,0x30(%rsp)
1745 movaps %xmm0,0x40(%rsp)
1746 movaps %xmm0,0x50(%rsp)
1747 movaps %xmm0,0x60(%rsp)
1748 movaps %xmm0,0x70(%rsp)
# Unwind information switches back to %rsp-based addressing before the
# routine ends.
1754 .cfi_def_cfa_register %rsp
1758 .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1762 ######################################################################
1763 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1764 # const AES_KEY *key1, const AES_KEY *key2,
1765 # const unsigned char iv[16]);
# Register aliases shared by the XTS routines.  Six tweak registers are
# mapped onto %xmm10..%xmm15.
1768 my @tweak=map("%xmm$_",(10..15));
# Tweak-update helpers: mask and carry/residue registers, plus a temporary
# that aliases the fifth tweak register.
1769 my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
# The IV pointer and the saved length share %r9: the IV is consumed by the
# first load of the routine, before the length backup is written.
1770 my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
# 0x70 bytes hold seven 16-byte slots (six for tweaks put aside in the
# main loop, one at 0x60 for round[0]^round[last]); Win64 adds 160 bytes
# for ten saved XMM registers.
1771 my $frame_size = 0x70 + ($win64?160:0);
1772 my $key_ = "%rbp"; # override so that we can use %r11 as FP
1775 .globl aesni_xts_encrypt
1776 .type aesni_xts_encrypt,\@function,6
# Frame setup: keep the incoming stack pointer in %r11, reserve the frame
# and round %rsp down to a 16-byte boundary for the aligned stack accesses.
1780 lea (%rsp),%r11 # frame pointer
1781 .cfi_def_cfa_register %r11
1784 sub \$$frame_size,%rsp
1785 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
# Win64 builds only: save %xmm6-%xmm15 at negative offsets from the frame
# pointer.
1787 $code.=<<___ if ($win64);
1788 movaps %xmm6,-0xa8(%r11) # offload everything
1789 movaps %xmm7,-0x98(%r11)
1790 movaps %xmm8,-0x88(%r11)
1791 movaps %xmm9,-0x78(%r11)
1792 movaps %xmm10,-0x68(%r11)
1793 movaps %xmm11,-0x58(%r11)
1794 movaps %xmm12,-0x48(%r11)
1795 movaps %xmm13,-0x38(%r11)
1796 movaps %xmm14,-0x28(%r11)
1797 movaps %xmm15,-0x18(%r11)
# Load the IV and both round counts.  The key2 access goes through the
# symbolic register name, matching aesni_xts_decrypt and the generator
# call below (the emitted instruction is unchanged).
1801 movups ($ivp),$inout0 # load clear-text tweak
1802 mov 240($key2),$rounds # key2->rounds
1803 mov 240($key),$rnds_ # key1->rounds
1805 # generate the tweak
# The IV in $inout0 is encrypted under key2 by the generated code.
1806 &aesni_generate1("enc",$key2,$rounds,$inout0);
# Key setup: keep copies of the key pointer, round count and length, and
# load the first and last round keys of key1.
1808 $movkey ($key),$rndkey0 # zero round key
1809 mov $key,$key_ # backup $key
1810 mov $rnds_,$rounds # backup $rounds
1812 mov $len,$len_ # backup $len
1815 $movkey 16($key,$rnds_),$rndkey1 # last round key
# tweak[5] is the running tweak, seeded with the encrypted IV.  $twres
# gets a dword shuffle of it that the tweak-update code uses to derive
# carry bits; $rndkey1 becomes round[0]^round[last].
1817 movdqa .Lxts_magic(%rip),$twmask
1818 movdqa $inout0,@tweak[5]
1819 pshufd \$0x5f,$inout0,$twres
1820 pxor $rndkey0,$rndkey1
1822 # alternative tweak calculation algorithm is based on suggestions
1823 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
1824 # and should help in the future...
# Generation-time loop emitting the code for tweak[0..3]: each pass copies
# the running tweak into tweak[i] with the zero-round key folded in, then
# advances the running tweak (paddq doubles both 64-bit halves and the
# final pxor folds in correction bits derived from $twres).
1825 for ($i=0;$i<4;$i++) {
1827 movdqa $twres,$twtmp
1829 movdqa @tweak[5],@tweak[$i]
1830 psrad \$31,$twtmp # broadcast upper bits
1831 paddq @tweak[5],@tweak[5]
1833 pxor $rndkey0,@tweak[$i]
1834 pxor $twtmp,@tweak[5]
# Same step once more for tweak[4], leaving the sixth tweak in tweak[5].
1838 movdqa @tweak[5],@tweak[4]
1840 paddq @tweak[5],@tweak[5]
1842 pxor $rndkey0,@tweak[4]
1843 pxor $twres,@tweak[5]
# The combined first/last round key is parked in the seventh stack slot;
# the main loop reloads it from there on every iteration.
1844 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
1847 jc .Lxts_enc_short # if $len-=6*16 borrowed
# Main-loop preparation: point $key at the end of the key schedule and
# keep the twisted round count in %r10.  %r8 (no longer needed as the key2
# pointer) is repointed at the .Lxts_magic constant, which the main loop
# reloads through it.
1850 lea 32($key_,$rnds_),$key # end of key schedule
1851 sub %r10,%rax # twisted $rounds
1852 $movkey 16($key_),$rndkey1
1853 mov %rax,%r10 # backup twisted $rounds
1854 lea .Lxts_magic(%rip),%r8
1855 jmp .Lxts_enc_grandloop
# Main loop: six blocks per iteration.  Input loads, tweak xors and the
# first AES round are interleaved; later on the tweaks for the NEXT
# iteration are computed in between the remaining rounds.
1858 .Lxts_enc_grandloop:
1859 movdqu `16*0`($inp),$inout0 # load input
1860 movdqa $rndkey0,$twmask
1861 movdqu `16*1`($inp),$inout1
1862 pxor @tweak[0],$inout0 # input^=tweak^round[0]
1863 movdqu `16*2`($inp),$inout2
1864 pxor @tweak[1],$inout1
1865 aesenc $rndkey1,$inout0
1866 movdqu `16*3`($inp),$inout3
1867 pxor @tweak[2],$inout2
1868 aesenc $rndkey1,$inout1
1869 movdqu `16*4`($inp),$inout4
1870 pxor @tweak[3],$inout3
1871 aesenc $rndkey1,$inout2
1872 movdqu `16*5`($inp),$inout5
1873 pxor @tweak[5],$twmask # round[0]^=tweak[5]
1874 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
1875 pxor @tweak[4],$inout4
1876 aesenc $rndkey1,$inout3
1877 $movkey 32($key_),$rndkey0
1878 lea `16*6`($inp),$inp
1879 pxor $twmask,$inout5
# Each tweak (which already contains round[0]) is xor-ed with
# round[0]^round[last], giving tweak^round[last].  These values are put
# aside in stack slots 0..5 and consumed by the aesenclast instructions at
# the end of the iteration.
1881 pxor $twres,@tweak[0] # calculate tweaks^round[last]
1882 aesenc $rndkey1,$inout4
1883 pxor $twres,@tweak[1]
1884 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last]
1885 aesenc $rndkey1,$inout5
1886 $movkey 48($key_),$rndkey1
1887 pxor $twres,@tweak[2]
1889 aesenc $rndkey0,$inout0
1890 pxor $twres,@tweak[3]
1891 movdqa @tweak[1],`16*1`(%rsp)
1892 aesenc $rndkey0,$inout1
1893 pxor $twres,@tweak[4]
1894 movdqa @tweak[2],`16*2`(%rsp)
1895 aesenc $rndkey0,$inout2
1896 aesenc $rndkey0,$inout3
1898 movdqa @tweak[4],`16*4`(%rsp)
1899 aesenc $rndkey0,$inout4
1900 aesenc $rndkey0,$inout5
1901 $movkey 64($key_),$rndkey0
1902 movdqa $twmask,`16*5`(%rsp)
1903 pshufd \$0x5f,@tweak[5],$twres
# Middle rounds: round keys are fetched relative to the end of the key
# schedule using the twisted round count in %rax.
1907 aesenc $rndkey1,$inout0
1908 aesenc $rndkey1,$inout1
1909 aesenc $rndkey1,$inout2
1910 aesenc $rndkey1,$inout3
1911 aesenc $rndkey1,$inout4
1912 aesenc $rndkey1,$inout5
1913 $movkey -64($key,%rax),$rndkey1
1916 aesenc $rndkey0,$inout0
1917 aesenc $rndkey0,$inout1
1918 aesenc $rndkey0,$inout2
1919 aesenc $rndkey0,$inout3
1920 aesenc $rndkey0,$inout4
1921 aesenc $rndkey0,$inout5
1922 $movkey -80($key,%rax),$rndkey0
# Closing rounds interleaved with the tweak schedule of the next
# iteration: tweak[0..4] are rebuilt one after another as round[0] xor
# running tweak, and the running tweak in tweak[5] is advanced after each.
1925 movdqa (%r8),$twmask # start calculating next tweak
1926 movdqa $twres,$twtmp
1928 aesenc $rndkey1,$inout0
1929 paddq @tweak[5],@tweak[5]
1931 aesenc $rndkey1,$inout1
1933 $movkey ($key_),@tweak[0] # load round[0]
1934 aesenc $rndkey1,$inout2
1935 aesenc $rndkey1,$inout3
1936 aesenc $rndkey1,$inout4
1937 pxor $twtmp,@tweak[5]
1938 movaps @tweak[0],@tweak[1] # copy round[0]
1939 aesenc $rndkey1,$inout5
1940 $movkey -64($key),$rndkey1
1942 movdqa $twres,$twtmp
1943 aesenc $rndkey0,$inout0
1945 pxor @tweak[5],@tweak[0]
1946 aesenc $rndkey0,$inout1
1948 paddq @tweak[5],@tweak[5]
1949 aesenc $rndkey0,$inout2
1950 aesenc $rndkey0,$inout3
1952 movaps @tweak[1],@tweak[2]
1953 aesenc $rndkey0,$inout4
1954 pxor $twtmp,@tweak[5]
1955 movdqa $twres,$twtmp
1956 aesenc $rndkey0,$inout5
1957 $movkey -48($key),$rndkey0
1960 aesenc $rndkey1,$inout0
1961 pxor @tweak[5],@tweak[1]
1963 aesenc $rndkey1,$inout1
1964 paddq @tweak[5],@tweak[5]
1966 aesenc $rndkey1,$inout2
1967 aesenc $rndkey1,$inout3
1968 movdqa @tweak[3],`16*3`(%rsp)
1969 pxor $twtmp,@tweak[5]
1970 aesenc $rndkey1,$inout4
1971 movaps @tweak[2],@tweak[3]
1972 movdqa $twres,$twtmp
1973 aesenc $rndkey1,$inout5
1974 $movkey -32($key),$rndkey1
1977 aesenc $rndkey0,$inout0
1978 pxor @tweak[5],@tweak[2]
1980 aesenc $rndkey0,$inout1
1981 paddq @tweak[5],@tweak[5]
1983 aesenc $rndkey0,$inout2
1984 aesenc $rndkey0,$inout3
1985 aesenc $rndkey0,$inout4
1986 pxor $twtmp,@tweak[5]
1987 movaps @tweak[3],@tweak[4]
1988 aesenc $rndkey0,$inout5
1990 movdqa $twres,$rndkey0
1992 aesenc $rndkey1,$inout0
1993 pxor @tweak[5],@tweak[3]
1995 aesenc $rndkey1,$inout1
1996 paddq @tweak[5],@tweak[5]
1997 pand $twmask,$rndkey0
1998 aesenc $rndkey1,$inout2
1999 aesenc $rndkey1,$inout3
2000 pxor $rndkey0,@tweak[5]
# $rndkey0 and $rndkey1 are reloaded with the first two round keys, ready
# for the next iteration.
2001 $movkey ($key_),$rndkey0
2002 aesenc $rndkey1,$inout4
2003 aesenc $rndkey1,$inout5
2004 $movkey 16($key_),$rndkey1
# Final round: aesenclast takes the tweak^round[last] values put aside on
# the stack, so the last round key and the output tweak are applied in one
# instruction.
2006 pxor @tweak[5],@tweak[4]
2007 aesenclast `16*0`(%rsp),$inout0
2009 paddq @tweak[5],@tweak[5]
2010 aesenclast `16*1`(%rsp),$inout1
2011 aesenclast `16*2`(%rsp),$inout2
2013 mov %r10,%rax # restore $rounds
2014 aesenclast `16*3`(%rsp),$inout3
2015 aesenclast `16*4`(%rsp),$inout4
2016 aesenclast `16*5`(%rsp),$inout5
2017 pxor $twres,@tweak[5]
# $out is advanced first, so the six stores use negative offsets.
2019 lea `16*6`($out),$out # $out+=6*16
2020 movups $inout0,`-16*6`($out) # store 6 output blocks
2021 movups $inout1,`-16*5`($out)
2022 movups $inout2,`-16*4`($out)
2023 movups $inout3,`-16*3`($out)
2024 movups $inout4,`-16*2`($out)
2025 movups $inout5,`-16*1`($out)
2027 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow
# Leaving the main loop: undo the key-pointer and round-count adjustments
# made for it.
2031 mov $key_,$key # restore $key
2032 shr \$4,$rounds # restore original value
2035 # at this point @tweak[0..5] are populated with tweak values
# Dispatch on the number of whole blocks left.  Each tweak is xor-ed with
# $rndkey0 just before the branch that may need it.
# NOTE(review): presumably this strips the zero-round key that was folded
# into the tweaks earlier, since the short paths apply the tweak both
# before and after the block encryption -- confirm.
2036 mov $rounds,$rnds_ # backup $rounds
2037 pxor $rndkey0,@tweak[0]
2038 add \$16*6,$len # restore real remaining $len
2039 jz .Lxts_enc_done # done if ($len==0)
2041 pxor $rndkey0,@tweak[1]
2043 jb .Lxts_enc_one # $len is 1*16
2044 pxor $rndkey0,@tweak[2]
2045 je .Lxts_enc_two # $len is 2*16
2047 pxor $rndkey0,@tweak[3]
2049 jb .Lxts_enc_three # $len is 3*16
2050 pxor $rndkey0,@tweak[4]
2051 je .Lxts_enc_four # $len is 4*16
# Five-block case: uses the six-way encryption helper with a zeroed sixth
# lane.  Tweaks are xor-ed into the data before and after the encryption.
2053 movdqu ($inp),$inout0 # $len is 5*16
2054 movdqu 16*1($inp),$inout1
2055 movdqu 16*2($inp),$inout2
2056 pxor @tweak[0],$inout0
2057 movdqu 16*3($inp),$inout3
2058 pxor @tweak[1],$inout1
2059 movdqu 16*4($inp),$inout4
2060 lea 16*5($inp),$inp # $inp+=5*16
2061 pxor @tweak[2],$inout2
2062 pxor @tweak[3],$inout3
2063 pxor @tweak[4],$inout4
2064 pxor $inout5,$inout5
2066 call _aesni_encrypt6
# The first unused tweak is moved into tweak[0], which is the register the
# partial-block code at the end of the routine works with.
2068 xorps @tweak[0],$inout0
2069 movdqa @tweak[5],@tweak[0]
2070 xorps @tweak[1],$inout1
2071 xorps @tweak[2],$inout2
2072 movdqu $inout0,($out) # store 5 output blocks
2073 xorps @tweak[3],$inout3
2074 movdqu $inout1,16*1($out)
2075 xorps @tweak[4],$inout4
2076 movdqu $inout2,16*2($out)
2077 movdqu $inout3,16*3($out)
2078 movdqu $inout4,16*4($out)
2079 lea 16*5($out),$out # $out+=5*16
# One-block case: tweak xor, generated single-block encryption, tweak xor
# again.  Afterwards the first unused tweak is moved into tweak[0] for the
# partial-block code at the end of the routine.
2084 movups ($inp),$inout0
2085 lea 16*1($inp),$inp # inp+=1*16
2086 xorps @tweak[0],$inout0
2088 &aesni_generate1("enc",$key,$rounds);
2090 xorps @tweak[0],$inout0
2091 movdqa @tweak[1],@tweak[0]
2092 movups $inout0,($out) # store one output block
2093 lea 16*1($out),$out # $out+=1*16
# Two-block case: same pattern around the two-way encryption helper.
2098 movups ($inp),$inout0
2099 movups 16($inp),$inout1
2100 lea 32($inp),$inp # $inp+=2*16
2101 xorps @tweak[0],$inout0
2102 xorps @tweak[1],$inout1
2104 call _aesni_encrypt2
2106 xorps @tweak[0],$inout0
2107 movdqa @tweak[2],@tweak[0]
2108 xorps @tweak[1],$inout1
2109 movups $inout0,($out) # store 2 output blocks
2110 movups $inout1,16*1($out)
2111 lea 16*2($out),$out # $out+=2*16
# Three-block case: same pattern around the three-way encryption helper.
2116 movups ($inp),$inout0
2117 movups 16*1($inp),$inout1
2118 movups 16*2($inp),$inout2
2119 lea 16*3($inp),$inp # $inp+=3*16
2120 xorps @tweak[0],$inout0
2121 xorps @tweak[1],$inout1
2122 xorps @tweak[2],$inout2
2124 call _aesni_encrypt3
2126 xorps @tweak[0],$inout0
2127 movdqa @tweak[3],@tweak[0]
2128 xorps @tweak[1],$inout1
2129 xorps @tweak[2],$inout2
2130 movups $inout0,($out) # store 3 output blocks
2131 movups $inout1,16*1($out)
2132 movups $inout2,16*2($out)
2133 lea 16*3($out),$out # $out+=3*16
# Four-block case: same pattern around the four-way encryption helper.
2138 movups ($inp),$inout0
2139 movups 16*1($inp),$inout1
2140 movups 16*2($inp),$inout2
2141 xorps @tweak[0],$inout0
2142 movups 16*3($inp),$inout3
2143 lea 16*4($inp),$inp # $inp+=4*16
2144 xorps @tweak[1],$inout1
2145 xorps @tweak[2],$inout2
2146 xorps @tweak[3],$inout3
2148 call _aesni_encrypt4
2150 pxor @tweak[0],$inout0
2151 movdqa @tweak[4],@tweak[0]
2152 pxor @tweak[1],$inout1
2153 pxor @tweak[2],$inout2
2154 movdqu $inout0,($out) # store 4 output blocks
2155 pxor @tweak[3],$inout3
2156 movdqu $inout1,16*1($out)
2157 movdqu $inout2,16*2($out)
2158 movdqu $inout3,16*3($out)
2159 lea 16*4($out),$out # $out+=4*16
# Trailing partial block: $len_ is reduced to the saved length modulo 16.
2164 and \$15,$len_ # see if $len%16 is 0
# Bytes are picked up one at a time from the remaining input and from the
# last full output block; %eax and %ecx serve as byte temporaries, which
# is why $rounds and $key have to be restored afterwards.
# NOTE(review): presumably the two bytes are exchanged (ciphertext
# stealing) -- confirm against the corresponding stores.
2169 movzb ($inp),%eax # borrow $rounds ...
2170 movzb -16($out),%ecx # ... and $key
2178 sub $len_,$out # rewind $out
2179 mov $key_,$key # restore $key
2180 mov $rnds_,$rounds # restore $rounds
# The last full output block is then re-encrypted in place, with tweak[0]
# applied before and after the generated single-block encryption.
2182 movups -16($out),$inout0
2183 xorps @tweak[0],$inout0
2185 &aesni_generate1("enc",$key,$rounds);
2187 xorps @tweak[0],$inout0
2188 movups $inout0,-16($out)
# Common exit: %xmm0 is zeroed and then used as the source for wiping
# everything this routine left on the stack.
2191 xorps %xmm0,%xmm0 # clear register bank
# Non-Win64 builds: overwrite the seven 16-byte frame slots, which held
# tweak values and round[0]^round[last].
2198 $code.=<<___ if (!$win64);
2201 movaps %xmm0,0x00(%rsp) # clear stack
2203 movaps %xmm0,0x10(%rsp)
2205 movaps %xmm0,0x20(%rsp)
2207 movaps %xmm0,0x30(%rsp)
2209 movaps %xmm0,0x40(%rsp)
2211 movaps %xmm0,0x50(%rsp)
2213 movaps %xmm0,0x60(%rsp)
# Win64 builds: restore %xmm6-%xmm15 from the save area below the frame
# pointer in %r11, overwriting each save slot right after it is read, then
# wipe the seven frame slots as well.
2217 $code.=<<___ if ($win64);
2218 movaps -0xa8(%r11),%xmm6
2219 movaps %xmm0,-0xa8(%r11) # clear stack
2220 movaps -0x98(%r11),%xmm7
2221 movaps %xmm0,-0x98(%r11)
2222 movaps -0x88(%r11),%xmm8
2223 movaps %xmm0,-0x88(%r11)
2224 movaps -0x78(%r11),%xmm9
2225 movaps %xmm0,-0x78(%r11)
2226 movaps -0x68(%r11),%xmm10
2227 movaps %xmm0,-0x68(%r11)
2228 movaps -0x58(%r11),%xmm11
2229 movaps %xmm0,-0x58(%r11)
2230 movaps -0x48(%r11),%xmm12
2231 movaps %xmm0,-0x48(%r11)
2232 movaps -0x38(%r11),%xmm13
2233 movaps %xmm0,-0x38(%r11)
2234 movaps -0x28(%r11),%xmm14
2235 movaps %xmm0,-0x28(%r11)
2236 movaps -0x18(%r11),%xmm15
2237 movaps %xmm0,-0x18(%r11)
2238 movaps %xmm0,0x00(%rsp)
2239 movaps %xmm0,0x10(%rsp)
2240 movaps %xmm0,0x20(%rsp)
2241 movaps %xmm0,0x30(%rsp)
2242 movaps %xmm0,0x40(%rsp)
2243 movaps %xmm0,0x50(%rsp)
2244 movaps %xmm0,0x60(%rsp)
# Unwind information switches back to %rsp-based addressing before the
# routine ends.
2250 .cfi_def_cfa_register %rsp
2254 .size aesni_xts_encrypt,.-aesni_xts_encrypt
2258 .globl aesni_xts_decrypt
2259 .type aesni_xts_decrypt,\@function,6
2263 lea (%rsp),%r11 # frame pointer
2264 .cfi_def_cfa_register %r11
2267 sub \$$frame_size,%rsp
2268 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
2270 $code.=<<___ if ($win64);
2271 movaps %xmm6,-0xa8(%r11) # offload everything
2272 movaps %xmm7,-0x98(%r11)
2273 movaps %xmm8,-0x88(%r11)
2274 movaps %xmm9,-0x78(%r11)
2275 movaps %xmm10,-0x68(%r11)
2276 movaps %xmm11,-0x58(%r11)
2277 movaps %xmm12,-0x48(%r11)
2278 movaps %xmm13,-0x38(%r11)
2279 movaps %xmm14,-0x28(%r11)
2280 movaps %xmm15,-0x18(%r11)
2284 movups ($ivp),$inout0 # load clear-text tweak
2285 mov 240($key2),$rounds # key2->rounds
2286 mov 240($key),$rnds_ # key1->rounds
2288 # generate the tweak
2289 &aesni_generate1("enc",$key2,$rounds,$inout0);
2291 xor %eax,%eax # if ($len%16) len-=16;
2297 $movkey ($key),$rndkey0 # zero round key
2298 mov $key,$key_ # backup $key
2299 mov $rnds_,$rounds # backup $rounds
2301 mov $len,$len_ # backup $len
2304 $movkey 16($key,$rnds_),$rndkey1 # last round key
2306 movdqa .Lxts_magic(%rip),$twmask
2307 movdqa $inout0,@tweak[5]
2308 pshufd \$0x5f,$inout0,$twres
2309 pxor $rndkey0,$rndkey1
2311 for ($i=0;$i<4;$i++) {
2313 movdqa $twres,$twtmp
2315 movdqa @tweak[5],@tweak[$i]
2316 psrad \$31,$twtmp # broadcast upper bits
2317 paddq @tweak[5],@tweak[5]
2319 pxor $rndkey0,@tweak[$i]
2320 pxor $twtmp,@tweak[5]
2324 movdqa @tweak[5],@tweak[4]
2326 paddq @tweak[5],@tweak[5]
2328 pxor $rndkey0,@tweak[4]
2329 pxor $twres,@tweak[5]
2330 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
2333 jc .Lxts_dec_short # if $len-=6*16 borrowed
2336 lea 32($key_,$rnds_),$key # end of key schedule
2337 sub %r10,%rax # twisted $rounds
2338 $movkey 16($key_),$rndkey1
2339 mov %rax,%r10 # backup twisted $rounds
2340 lea .Lxts_magic(%rip),%r8
2341 jmp .Lxts_dec_grandloop
2344 .Lxts_dec_grandloop:
2345 movdqu `16*0`($inp),$inout0 # load input
2346 movdqa $rndkey0,$twmask
2347 movdqu `16*1`($inp),$inout1
2348 pxor @tweak[0],$inout0 # input^=tweak^round[0]
2349 movdqu `16*2`($inp),$inout2
2350 pxor @tweak[1],$inout1
2351 aesdec $rndkey1,$inout0
2352 movdqu `16*3`($inp),$inout3
2353 pxor @tweak[2],$inout2
2354 aesdec $rndkey1,$inout1
2355 movdqu `16*4`($inp),$inout4
2356 pxor @tweak[3],$inout3
2357 aesdec $rndkey1,$inout2
2358 movdqu `16*5`($inp),$inout5
2359 pxor @tweak[5],$twmask # round[0]^=tweak[5]
2360 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
2361 pxor @tweak[4],$inout4
2362 aesdec $rndkey1,$inout3
2363 $movkey 32($key_),$rndkey0
2364 lea `16*6`($inp),$inp
2365 pxor $twmask,$inout5
2367 pxor $twres,@tweak[0] # calculate tweaks^round[last]
2368 aesdec $rndkey1,$inout4
2369 pxor $twres,@tweak[1]
2370 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
2371 aesdec $rndkey1,$inout5
2372 $movkey 48($key_),$rndkey1
2373 pxor $twres,@tweak[2]
2375 aesdec $rndkey0,$inout0
2376 pxor $twres,@tweak[3]
2377 movdqa @tweak[1],`16*1`(%rsp)
2378 aesdec $rndkey0,$inout1
2379 pxor $twres,@tweak[4]
2380 movdqa @tweak[2],`16*2`(%rsp)
2381 aesdec $rndkey0,$inout2
2382 aesdec $rndkey0,$inout3
2384 movdqa @tweak[4],`16*4`(%rsp)
2385 aesdec $rndkey0,$inout4
2386 aesdec $rndkey0,$inout5
2387 $movkey 64($key_),$rndkey0
2388 movdqa $twmask,`16*5`(%rsp)
2389 pshufd \$0x5f,@tweak[5],$twres
2393 aesdec $rndkey1,$inout0
2394 aesdec $rndkey1,$inout1
2395 aesdec $rndkey1,$inout2
2396 aesdec $rndkey1,$inout3
2397 aesdec $rndkey1,$inout4
2398 aesdec $rndkey1,$inout5
2399 $movkey -64($key,%rax),$rndkey1
2402 aesdec $rndkey0,$inout0
2403 aesdec $rndkey0,$inout1
2404 aesdec $rndkey0,$inout2
2405 aesdec $rndkey0,$inout3
2406 aesdec $rndkey0,$inout4
2407 aesdec $rndkey0,$inout5
2408 $movkey -80($key,%rax),$rndkey0
2411 movdqa (%r8),$twmask # start calculating next tweak
2412 movdqa $twres,$twtmp
2414 aesdec $rndkey1,$inout0
2415 paddq @tweak[5],@tweak[5]
2417 aesdec $rndkey1,$inout1
2419 $movkey ($key_),@tweak[0] # load round[0]
2420 aesdec $rndkey1,$inout2
2421 aesdec $rndkey1,$inout3
2422 aesdec $rndkey1,$inout4
2423 pxor $twtmp,@tweak[5]
2424 movaps @tweak[0],@tweak[1] # copy round[0]
2425 aesdec $rndkey1,$inout5
2426 $movkey -64($key),$rndkey1
2428 movdqa $twres,$twtmp
2429 aesdec $rndkey0,$inout0
2431 pxor @tweak[5],@tweak[0]
2432 aesdec $rndkey0,$inout1
2434 paddq @tweak[5],@tweak[5]
2435 aesdec $rndkey0,$inout2
2436 aesdec $rndkey0,$inout3
2438 movaps @tweak[1],@tweak[2]
2439 aesdec $rndkey0,$inout4
2440 pxor $twtmp,@tweak[5]
2441 movdqa $twres,$twtmp
2442 aesdec $rndkey0,$inout5
2443 $movkey -48($key),$rndkey0
2446 aesdec $rndkey1,$inout0
2447 pxor @tweak[5],@tweak[1]
2449 aesdec $rndkey1,$inout1
2450 paddq @tweak[5],@tweak[5]
2452 aesdec $rndkey1,$inout2
2453 aesdec $rndkey1,$inout3
2454 movdqa @tweak[3],`16*3`(%rsp)
2455 pxor $twtmp,@tweak[5]
2456 aesdec $rndkey1,$inout4
2457 movaps @tweak[2],@tweak[3]
2458 movdqa $twres,$twtmp
2459 aesdec $rndkey1,$inout5
2460 $movkey -32($key),$rndkey1
2463 aesdec $rndkey0,$inout0
2464 pxor @tweak[5],@tweak[2]
2466 aesdec $rndkey0,$inout1
2467 paddq @tweak[5],@tweak[5]
2469 aesdec $rndkey0,$inout2
2470 aesdec $rndkey0,$inout3
2471 aesdec $rndkey0,$inout4
2472 pxor $twtmp,@tweak[5]
2473 movaps @tweak[3],@tweak[4]
2474 aesdec $rndkey0,$inout5
2476 movdqa $twres,$rndkey0
2478 aesdec $rndkey1,$inout0
2479 pxor @tweak[5],@tweak[3]
2481 aesdec $rndkey1,$inout1
2482 paddq @tweak[5],@tweak[5]
2483 pand $twmask,$rndkey0
2484 aesdec $rndkey1,$inout2
2485 aesdec $rndkey1,$inout3
2486 pxor $rndkey0,@tweak[5]
2487 $movkey ($key_),$rndkey0
2488 aesdec $rndkey1,$inout4
2489 aesdec $rndkey1,$inout5
2490 $movkey 16($key_),$rndkey1
2492 pxor @tweak[5],@tweak[4]
2493 aesdeclast `16*0`(%rsp),$inout0
2495 paddq @tweak[5],@tweak[5]
2496 aesdeclast `16*1`(%rsp),$inout1
2497 aesdeclast `16*2`(%rsp),$inout2
2499 mov %r10,%rax # restore $rounds
2500 aesdeclast `16*3`(%rsp),$inout3
2501 aesdeclast `16*4`(%rsp),$inout4
2502 aesdeclast `16*5`(%rsp),$inout5
2503 pxor $twres,@tweak[5]
2505 lea `16*6`($out),$out # $out+=6*16
2506 movups $inout0,`-16*6`($out) # store 6 output blocks
2507 movups $inout1,`-16*5`($out)
2508 movups $inout2,`-16*4`($out)
2509 movups $inout3,`-16*3`($out)
2510 movups $inout4,`-16*2`($out)
2511 movups $inout5,`-16*1`($out)
2513 jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow
2517 mov $key_,$key # restore $key
2518 shr \$4,$rounds # restore original value
2521 # at the point @tweak[0..5] are populated with tweak values
2522 mov $rounds,$rnds_ # backup $rounds
2523 pxor $rndkey0,@tweak[0]
2524 pxor $rndkey0,@tweak[1]
2525 add \$16*6,$len # restore real remaining $len
2526 jz .Lxts_dec_done # done if ($len==0)
2528 pxor $rndkey0,@tweak[2]
2530 jb .Lxts_dec_one # $len is 1*16
2531 pxor $rndkey0,@tweak[3]
2532 je .Lxts_dec_two # $len is 2*16
2534 pxor $rndkey0,@tweak[4]
2536 jb .Lxts_dec_three # $len is 3*16
2537 je .Lxts_dec_four # $len is 4*16
2539 movdqu ($inp),$inout0 # $len is 5*16
2540 movdqu 16*1($inp),$inout1
2541 movdqu 16*2($inp),$inout2
2542 pxor @tweak[0],$inout0
2543 movdqu 16*3($inp),$inout3
2544 pxor @tweak[1],$inout1
2545 movdqu 16*4($inp),$inout4
2546 lea 16*5($inp),$inp # $inp+=5*16
2547 pxor @tweak[2],$inout2
2548 pxor @tweak[3],$inout3
2549 pxor @tweak[4],$inout4
2551 call _aesni_decrypt6
2553 xorps @tweak[0],$inout0
2554 xorps @tweak[1],$inout1
2555 xorps @tweak[2],$inout2
2556 movdqu $inout0,($out) # store 5 output blocks
2557 xorps @tweak[3],$inout3
2558 movdqu $inout1,16*1($out)
2559 xorps @tweak[4],$inout4
2560 movdqu $inout2,16*2($out)
2562 movdqu $inout3,16*3($out)
2563 pcmpgtd @tweak[5],$twtmp
2564 movdqu $inout4,16*4($out)
2565 lea 16*5($out),$out # $out+=5*16
2566 pshufd \$0x13,$twtmp,@tweak[1] # $twres
2570 movdqa @tweak[5],@tweak[0]
2571 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
2572 pand $twmask,@tweak[1] # isolate carry and residue
2573 pxor @tweak[5],@tweak[1]
2578 movups ($inp),$inout0
2579 lea 16*1($inp),$inp # $inp+=1*16
2580 xorps @tweak[0],$inout0
2582 &aesni_generate1("dec",$key,$rounds);
2584 xorps @tweak[0],$inout0
2585 movdqa @tweak[1],@tweak[0]
2586 movups $inout0,($out) # store one output block
2587 movdqa @tweak[2],@tweak[1]
2588 lea 16*1($out),$out # $out+=1*16
2593 movups ($inp),$inout0
2594 movups 16($inp),$inout1
2595 lea 32($inp),$inp # $inp+=2*16
2596 xorps @tweak[0],$inout0
2597 xorps @tweak[1],$inout1
2599 call _aesni_decrypt2
2601 xorps @tweak[0],$inout0
2602 movdqa @tweak[2],@tweak[0]
2603 xorps @tweak[1],$inout1
2604 movdqa @tweak[3],@tweak[1]
2605 movups $inout0,($out) # store 2 output blocks
2606 movups $inout1,16*1($out)
2607 lea 16*2($out),$out # $out+=2*16
2612 movups ($inp),$inout0
2613 movups 16*1($inp),$inout1
2614 movups 16*2($inp),$inout2
2615 lea 16*3($inp),$inp # $inp+=3*16
2616 xorps @tweak[0],$inout0
2617 xorps @tweak[1],$inout1
2618 xorps @tweak[2],$inout2
2620 call _aesni_decrypt3
2622 xorps @tweak[0],$inout0
2623 movdqa @tweak[3],@tweak[0]
2624 xorps @tweak[1],$inout1
2625 movdqa @tweak[4],@tweak[1]
2626 xorps @tweak[2],$inout2
2627 movups $inout0,($out) # store 3 output blocks
2628 movups $inout1,16*1($out)
2629 movups $inout2,16*2($out)
2630 lea 16*3($out),$out # $out+=3*16
2635 movups ($inp),$inout0
2636 movups 16*1($inp),$inout1
2637 movups 16*2($inp),$inout2
2638 xorps @tweak[0],$inout0
2639 movups 16*3($inp),$inout3
2640 lea 16*4($inp),$inp # $inp+=4*16
2641 xorps @tweak[1],$inout1
2642 xorps @tweak[2],$inout2
2643 xorps @tweak[3],$inout3
2645 call _aesni_decrypt4
2647 pxor @tweak[0],$inout0
2648 movdqa @tweak[4],@tweak[0]
2649 pxor @tweak[1],$inout1
2650 movdqa @tweak[5],@tweak[1]
2651 pxor @tweak[2],$inout2
2652 movdqu $inout0,($out) # store 4 output blocks
2653 pxor @tweak[3],$inout3
2654 movdqu $inout1,16*1($out)
2655 movdqu $inout2,16*2($out)
2656 movdqu $inout3,16*3($out)
2657 lea 16*4($out),$out # $out+=4*16
2662 and \$15,$len_ # see if $len%16 is 0
2666 mov $key_,$key # restore $key
2667 mov $rnds_,$rounds # restore $rounds
2669 movups ($inp),$inout0
2670 xorps @tweak[1],$inout0
2672 &aesni_generate1("dec",$key,$rounds);
2674 xorps @tweak[1],$inout0
2675 movups $inout0,($out)
2678 movzb 16($inp),%eax # borrow $rounds ...
2679 movzb ($out),%ecx # ... and $key
2687 sub $len_,$out # rewind $out
2688 mov $key_,$key # restore $key
2689 mov $rnds_,$rounds # restore $rounds
2691 movups ($out),$inout0
2692 xorps @tweak[0],$inout0
2694 &aesni_generate1("dec",$key,$rounds);
2696 xorps @tweak[0],$inout0
2697 movups $inout0,($out)
2700 xorps %xmm0,%xmm0 # clear register bank
2707 $code.=<<___ if (!$win64);
2710 movaps %xmm0,0x00(%rsp) # clear stack
2712 movaps %xmm0,0x10(%rsp)
2714 movaps %xmm0,0x20(%rsp)
2716 movaps %xmm0,0x30(%rsp)
2718 movaps %xmm0,0x40(%rsp)
2720 movaps %xmm0,0x50(%rsp)
2722 movaps %xmm0,0x60(%rsp)
2726 $code.=<<___ if ($win64);
2727 movaps -0xa8(%r11),%xmm6
2728 movaps %xmm0,-0xa8(%r11) # clear stack
2729 movaps -0x98(%r11),%xmm7
2730 movaps %xmm0,-0x98(%r11)
2731 movaps -0x88(%r11),%xmm8
2732 movaps %xmm0,-0x88(%r11)
2733 movaps -0x78(%r11),%xmm9
2734 movaps %xmm0,-0x78(%r11)
2735 movaps -0x68(%r11),%xmm10
2736 movaps %xmm0,-0x68(%r11)
2737 movaps -0x58(%r11),%xmm11
2738 movaps %xmm0,-0x58(%r11)
2739 movaps -0x48(%r11),%xmm12
2740 movaps %xmm0,-0x48(%r11)
2741 movaps -0x38(%r11),%xmm13
2742 movaps %xmm0,-0x38(%r11)
2743 movaps -0x28(%r11),%xmm14
2744 movaps %xmm0,-0x28(%r11)
2745 movaps -0x18(%r11),%xmm15
2746 movaps %xmm0,-0x18(%r11)
2747 movaps %xmm0,0x00(%rsp)
2748 movaps %xmm0,0x10(%rsp)
2749 movaps %xmm0,0x20(%rsp)
2750 movaps %xmm0,0x30(%rsp)
2751 movaps %xmm0,0x40(%rsp)
2752 movaps %xmm0,0x50(%rsp)
2753 movaps %xmm0,0x60(%rsp)
2759 .cfi_def_cfa_register %rsp
2763 .size aesni_xts_decrypt,.-aesni_xts_decrypt
2767 ######################################################################
2768 # void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
2769 # const AES_KEY *key, unsigned int start_block_num,
2770 # unsigned char offset_i[16], const unsigned char L_[][16],
2771 # unsigned char checksum[16]);
# Register map for the OCB code below.
# offset[0..5] live in %xmm10-%xmm15 and hold per-block offsets; offset[5]
# also carries the running offset_i from one helper call to the next.
2774 my @offset=map("%xmm$_",(10..15));
# checksum accumulator, and the register that is loaded with round[0] and
# then xored with round[last] on entry to the public functions
2775 my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
2776 my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments
# pointers loaded from the 7th and 8th arguments (see $seventh_arg below)
2777 my ($L_p,$checksum_p) = ("%rbx","%rbp");
# byte offsets into the L_ table, computed as ntz(block number) << 4
2778 my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
# displacement of the 7th argument relative to the address held in %rax
# when the arguments are fetched; presumably 8 skips the return address,
# and 56 additionally skips the Win64 register home area and the 5th/6th
# argument slots -- TODO confirm against the prologue
2779 my $seventh_arg = $win64 ? 56 : 8;
# aesni_ocb_encrypt: public OCB bulk-encryption entry point (C prototype is
# given in the comment block preceding the register declarations).
# The running offset and the checksum are loaded from the caller's buffers
# on entry and written back before returning. Blocks are processed six at
# a time in the grand loop, with dedicated paths for one to five blocks.
2783 .globl aesni_ocb_encrypt
2784 .type aesni_ocb_encrypt,\@function,6
2800 $code.=<<___ if ($win64);
2801 lea -0xa0(%rsp),%rsp
2802 movaps %xmm6,0x00(%rsp) # offload everything
2803 movaps %xmm7,0x10(%rsp)
2804 movaps %xmm8,0x20(%rsp)
2805 movaps %xmm9,0x30(%rsp)
2806 movaps %xmm10,0x40(%rsp)
2807 movaps %xmm11,0x50(%rsp)
2808 movaps %xmm12,0x60(%rsp)
2809 movaps %xmm13,0x70(%rsp)
2810 movaps %xmm14,0x80(%rsp)
2811 movaps %xmm15,0x90(%rsp)
2815 mov $seventh_arg(%rax),$L_p # 7th argument
2816 mov $seventh_arg+8(%rax),$checksum_p# 8th argument
# the 32-bit field at byte offset 240 of the key structure is read here
# (the CBC code in this file labels the same load key->rounds)
2818 mov 240($key),$rnds_
2821 $movkey ($key),$rndkey0l # round[0]
2822 $movkey 16($key,$rnds_),$rndkey1 # round[last]
# from here on the running offset is kept pre-xored with round[last]
2824 movdqu ($offset_p),@offset[5] # load last offset_i
2825 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
2826 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
2829 lea 32($key_,$rnds_),$key
2830 $movkey 16($key_),$rndkey1 # round[1]
2831 sub %r10,%rax # twisted $rounds
2832 mov %rax,%r10 # backup twisted $rounds
2834 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
2835 movdqu ($checksum_p),$checksum # load checksum
2837 test \$1,$block_num # is first block number odd?
# single leading block: its L_ table entry is loaded into a borrowed data
# register, and that register is copied into the running-offset register
# once the block has been processed
2843 movdqu ($L_p,$i1),$inout5 # borrow
2844 movdqu ($inp),$inout0
2849 movdqa $inout5,@offset[5]
2850 movups $inout0,($out)
# L_ table offsets for a six-block group: the block numbers at +1, +3 and
# +5 need a table lookup by ntz, the remaining three use L_0 loaded above
2856 lea 1($block_num),$i1 # even-numbered blocks
2857 lea 3($block_num),$i3
2858 lea 5($block_num),$i5
2859 lea 6($block_num),$block_num
2860 bsf $i1,$i1 # ntz(block)
2863 shl \$4,$i1 # ntz(block) -> table offset
2869 jmp .Locb_enc_grandloop
# main loop: six input blocks are loaded and six output blocks are stored
# per iteration
2872 .Locb_enc_grandloop:
2873 movdqu `16*0`($inp),$inout0 # load input
2874 movdqu `16*1`($inp),$inout1
2875 movdqu `16*2`($inp),$inout2
2876 movdqu `16*3`($inp),$inout3
2877 movdqu `16*4`($inp),$inout4
2878 movdqu `16*5`($inp),$inout5
2879 lea `16*6`($inp),$inp
2883 movups $inout0,`16*0`($out) # store output
2884 movups $inout1,`16*1`($out)
2885 movups $inout2,`16*2`($out)
2886 movups $inout3,`16*3`($out)
2887 movups $inout4,`16*4`($out)
2888 movups $inout5,`16*5`($out)
2889 lea `16*6`($out),$out
2891 jnc .Locb_enc_grandloop
# tail: up to five remaining blocks are loaded; data registers that carry
# no block are zeroed before the multi-block helpers are used
2897 movdqu `16*0`($inp),$inout0
2900 movdqu `16*1`($inp),$inout1
2903 movdqu `16*2`($inp),$inout2
2906 movdqu `16*3`($inp),$inout3
2909 movdqu `16*4`($inp),$inout4
2910 pxor $inout5,$inout5
# five-block tail: the fifth offset becomes the running offset
2914 movdqa @offset[4],@offset[5]
2915 movups $inout0,`16*0`($out)
2916 movups $inout1,`16*1`($out)
2917 movups $inout2,`16*2`($out)
2918 movups $inout3,`16*3`($out)
2919 movups $inout4,`16*4`($out)
# one-block tail: the first offset register (loaded with L_0) is handed
# over in a borrowed data register, which then becomes the running offset
2925 movdqa @offset[0],$inout5 # borrow
2929 movdqa $inout5,@offset[5]
2930 movups $inout0,`16*0`($out)
# two-block tail: two unused data registers are zeroed, the second offset
# becomes the running offset
2935 pxor $inout2,$inout2
2936 pxor $inout3,$inout3
2940 movdqa @offset[1],@offset[5]
2941 movups $inout0,`16*0`($out)
2942 movups $inout1,`16*1`($out)
# three-block tail: the third offset becomes the running offset
2948 pxor $inout3,$inout3
2952 movdqa @offset[2],@offset[5]
2953 movups $inout0,`16*0`($out)
2954 movups $inout1,`16*1`($out)
2955 movups $inout2,`16*2`($out)
# four-block tail: the fourth offset becomes the running offset
2963 movdqa @offset[3],@offset[5]
2964 movups $inout0,`16*0`($out)
2965 movups $inout1,`16*1`($out)
2966 movups $inout2,`16*2`($out)
2967 movups $inout3,`16*3`($out)
# write the updated state back to the caller's buffers
2970 pxor $rndkey0,@offset[5] # "remove" round[last]
2971 movdqu $checksum,($checksum_p) # store checksum
2972 movdqu @offset[5],($offset_p) # store last offset_i
2974 xorps %xmm0,%xmm0 # clear register bank
2981 $code.=<<___ if (!$win64);
2995 $code.=<<___ if ($win64);
# reload xmm6-xmm15 from the slots filled on entry and overwrite each slot
# with the zeroed xmm0
2996 movaps 0x00(%rsp),%xmm6
2997 movaps %xmm0,0x00(%rsp) # clear stack
2998 movaps 0x10(%rsp),%xmm7
2999 movaps %xmm0,0x10(%rsp)
3000 movaps 0x20(%rsp),%xmm8
3001 movaps %xmm0,0x20(%rsp)
3002 movaps 0x30(%rsp),%xmm9
3003 movaps %xmm0,0x30(%rsp)
3004 movaps 0x40(%rsp),%xmm10
3005 movaps %xmm0,0x40(%rsp)
3006 movaps 0x50(%rsp),%xmm11
3007 movaps %xmm0,0x50(%rsp)
3008 movaps 0x60(%rsp),%xmm12
3009 movaps %xmm0,0x60(%rsp)
3010 movaps 0x70(%rsp),%xmm13
3011 movaps %xmm0,0x70(%rsp)
3012 movaps 0x80(%rsp),%xmm14
3013 movaps %xmm0,0x80(%rsp)
3014 movaps 0x90(%rsp),%xmm15
3015 movaps %xmm0,0x90(%rsp)
3016 lea 0xa0+0x28(%rsp),%rax
3031 .cfi_def_cfa_register %rsp
3035 .size aesni_ocb_encrypt,.-aesni_ocb_encrypt
# __ocb_encrypt6: encrypt the six blocks held in the six data registers.
# Six consecutive offsets are derived by chained xors of L_ table entries,
# starting from the running offset. Each input is folded into the checksum
# before it is masked with its offset. The output-side offset is applied
# through the aesenclast operand (offset_i ^ round[last]) rather than by a
# separate xor. Table offsets for the next six-block group are computed
# in between the rounds.
3037 .type __ocb_encrypt6,\@abi-omnipotent
3041 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3042 movdqu ($L_p,$i1),@offset[1]
3043 movdqa @offset[0],@offset[2]
3044 movdqu ($L_p,$i3),@offset[3]
3045 movdqa @offset[0],@offset[4]
3046 pxor @offset[5],@offset[0]
3047 movdqu ($L_p,$i5),@offset[5]
3048 pxor @offset[0],@offset[1]
3049 pxor $inout0,$checksum # accumulate checksum
3050 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3051 pxor @offset[1],@offset[2]
3052 pxor $inout1,$checksum
3053 pxor @offset[1],$inout1
3054 pxor @offset[2],@offset[3]
3055 pxor $inout2,$checksum
3056 pxor @offset[2],$inout2
3057 pxor @offset[3],@offset[4]
3058 pxor $inout3,$checksum
3059 pxor @offset[3],$inout3
3060 pxor @offset[4],@offset[5]
3061 pxor $inout4,$checksum
3062 pxor @offset[4],$inout4
3063 pxor $inout5,$checksum
3064 pxor @offset[5],$inout5
3065 $movkey 32($key_),$rndkey0
# block-number arithmetic for the next group, interleaved with the rounds
3067 lea 1($block_num),$i1 # even-numbered blocks
3068 lea 3($block_num),$i3
3069 lea 5($block_num),$i5
3071 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3072 bsf $i1,$i1 # ntz(block)
3076 aesenc $rndkey1,$inout0
3077 aesenc $rndkey1,$inout1
3078 aesenc $rndkey1,$inout2
3079 aesenc $rndkey1,$inout3
# the remaining offsets are converted to offset_i ^ round[last] as well
3080 pxor $rndkey0l,@offset[1]
3081 pxor $rndkey0l,@offset[2]
3082 aesenc $rndkey1,$inout4
3083 pxor $rndkey0l,@offset[3]
3084 pxor $rndkey0l,@offset[4]
3085 aesenc $rndkey1,$inout5
3086 $movkey 48($key_),$rndkey1
3087 pxor $rndkey0l,@offset[5]
3089 aesenc $rndkey0,$inout0
3090 aesenc $rndkey0,$inout1
3091 aesenc $rndkey0,$inout2
3092 aesenc $rndkey0,$inout3
3093 aesenc $rndkey0,$inout4
3094 aesenc $rndkey0,$inout5
3095 $movkey 64($key_),$rndkey0
3096 shl \$4,$i1 # ntz(block) -> table offset
3102 aesenc $rndkey1,$inout0
3103 aesenc $rndkey1,$inout1
3104 aesenc $rndkey1,$inout2
3105 aesenc $rndkey1,$inout3
3106 aesenc $rndkey1,$inout4
3107 aesenc $rndkey1,$inout5
# NOTE(review): from here round keys are fetched relative to the key
# register with rax as index; rax is reset from r10 further down
3108 $movkey ($key,%rax),$rndkey1
3111 aesenc $rndkey0,$inout0
3112 aesenc $rndkey0,$inout1
3113 aesenc $rndkey0,$inout2
3114 aesenc $rndkey0,$inout3
3115 aesenc $rndkey0,$inout4
3116 aesenc $rndkey0,$inout5
3117 $movkey -16($key,%rax),$rndkey0
3120 aesenc $rndkey1,$inout0
3121 aesenc $rndkey1,$inout1
3122 aesenc $rndkey1,$inout2
3123 aesenc $rndkey1,$inout3
3124 aesenc $rndkey1,$inout4
3125 aesenc $rndkey1,$inout5
3126 $movkey 16($key_),$rndkey1
# final round uses the prepared offsets as the round-key operand; the
# first offset register is reloaded with L_0 right after its last use
3129 aesenclast @offset[0],$inout0
3130 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3131 mov %r10,%rax # restore twisted rounds
3132 aesenclast @offset[1],$inout1
3133 aesenclast @offset[2],$inout2
3134 aesenclast @offset[3],$inout3
3135 aesenclast @offset[4],$inout4
3136 aesenclast @offset[5],$inout5
3139 .size __ocb_encrypt6,.-__ocb_encrypt6
# __ocb_encrypt4: four-block variant of the six-block helper.
# Four offsets are chained from the running offset and two L_ table
# entries, the four inputs are folded into the checksum before encryption,
# and aesenclast takes offset_i ^ round[last] as its operand. No table
# offsets for a following group are computed here.
3141 .type __ocb_encrypt4,\@abi-omnipotent
3145 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3146 movdqu ($L_p,$i1),@offset[1]
3147 movdqa @offset[0],@offset[2]
3148 movdqu ($L_p,$i3),@offset[3]
3149 pxor @offset[5],@offset[0]
3150 pxor @offset[0],@offset[1]
3151 pxor $inout0,$checksum # accumulate checksum
3152 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3153 pxor @offset[1],@offset[2]
3154 pxor $inout1,$checksum
3155 pxor @offset[1],$inout1
3156 pxor @offset[2],@offset[3]
3157 pxor $inout2,$checksum
3158 pxor @offset[2],$inout2
3159 pxor $inout3,$checksum
3160 pxor @offset[3],$inout3
3161 $movkey 32($key_),$rndkey0
# convert all four offsets for use as the final-round operand
3163 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3164 pxor $rndkey0l,@offset[1]
3165 pxor $rndkey0l,@offset[2]
3166 pxor $rndkey0l,@offset[3]
3168 aesenc $rndkey1,$inout0
3169 aesenc $rndkey1,$inout1
3170 aesenc $rndkey1,$inout2
3171 aesenc $rndkey1,$inout3
3172 $movkey 48($key_),$rndkey1
3174 aesenc $rndkey0,$inout0
3175 aesenc $rndkey0,$inout1
3176 aesenc $rndkey0,$inout2
3177 aesenc $rndkey0,$inout3
3178 $movkey 64($key_),$rndkey0
3183 aesenc $rndkey1,$inout0
3184 aesenc $rndkey1,$inout1
3185 aesenc $rndkey1,$inout2
3186 aesenc $rndkey1,$inout3
3187 $movkey ($key,%rax),$rndkey1
3190 aesenc $rndkey0,$inout0
3191 aesenc $rndkey0,$inout1
3192 aesenc $rndkey0,$inout2
3193 aesenc $rndkey0,$inout3
3194 $movkey -16($key,%rax),$rndkey0
3197 aesenc $rndkey1,$inout0
3198 aesenc $rndkey1,$inout1
3199 aesenc $rndkey1,$inout2
3200 aesenc $rndkey1,$inout3
3201 $movkey 16($key_),$rndkey1
3202 mov %r10,%rax # restore twisted rounds
# final round with the prepared offsets as operand
3204 aesenclast @offset[0],$inout0
3205 aesenclast @offset[1],$inout1
3206 aesenclast @offset[2],$inout2
3207 aesenclast @offset[3],$inout3
3210 .size __ocb_encrypt4,.-__ocb_encrypt4
# __ocb_encrypt1: encrypt the single block in the first data register.
# The sixth data register is used as the offset register: the public
# function loads an L_ table entry into it (marked borrow there), it is
# combined with the running offset here, and it is left holding the value
# used as the aesenclast operand so the caller can copy it back into the
# running-offset register. The input is folded into the checksum before
# encryption.
3212 .type __ocb_encrypt1,\@abi-omnipotent
3216 pxor @offset[5],$inout5 # offset_i
3217 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3218 pxor $inout0,$checksum # accumulate checksum
3219 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3220 $movkey 32($key_),$rndkey0
3222 aesenc $rndkey1,$inout0
3223 $movkey 48($key_),$rndkey1
3224 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3226 aesenc $rndkey0,$inout0
3227 $movkey 64($key_),$rndkey0
3232 aesenc $rndkey1,$inout0
3233 $movkey ($key,%rax),$rndkey1
3236 aesenc $rndkey0,$inout0
3237 $movkey -16($key,%rax),$rndkey0
3240 aesenc $rndkey1,$inout0
3241 $movkey 16($key_),$rndkey1 # redundant in tail
3242 mov %r10,%rax # restore twisted rounds
# last round doubles as the output-side offset xor
3244 aesenclast $inout5,$inout0
3247 .size __ocb_encrypt1,.-__ocb_encrypt1
# aesni_ocb_decrypt: public OCB bulk-decryption entry point, laid out like
# aesni_ocb_encrypt. The difference visible in this function is the
# checksum handling: each output block is xored into the checksum right
# after it is stored, whereas on the encrypt side the helpers fold the
# inputs into the checksum.
3249 .globl aesni_ocb_decrypt
3250 .type aesni_ocb_decrypt,\@function,6
3266 $code.=<<___ if ($win64);
3267 lea -0xa0(%rsp),%rsp
3268 movaps %xmm6,0x00(%rsp) # offload everything
3269 movaps %xmm7,0x10(%rsp)
3270 movaps %xmm8,0x20(%rsp)
3271 movaps %xmm9,0x30(%rsp)
3272 movaps %xmm10,0x40(%rsp)
3273 movaps %xmm11,0x50(%rsp)
3274 movaps %xmm12,0x60(%rsp)
3275 movaps %xmm13,0x70(%rsp)
3276 movaps %xmm14,0x80(%rsp)
3277 movaps %xmm15,0x90(%rsp)
3281 mov $seventh_arg(%rax),$L_p # 7th argument
3282 mov $seventh_arg+8(%rax),$checksum_p# 8th argument
# the 32-bit field at byte offset 240 of the key structure is read here
# (the CBC code in this file labels the same load key->rounds)
3284 mov 240($key),$rnds_
3287 $movkey ($key),$rndkey0l # round[0]
3288 $movkey 16($key,$rnds_),$rndkey1 # round[last]
# from here on the running offset is kept pre-xored with round[last]
3290 movdqu ($offset_p),@offset[5] # load last offset_i
3291 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
3292 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
3295 lea 32($key_,$rnds_),$key
3296 $movkey 16($key_),$rndkey1 # round[1]
3297 sub %r10,%rax # twisted $rounds
3298 mov %rax,%r10 # backup twisted $rounds
3300 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3301 movdqu ($checksum_p),$checksum # load checksum
3303 test \$1,$block_num # is first block number odd?
# single leading block: its L_ table entry is loaded into a borrowed data
# register, and that register is copied into the running-offset register
# once the block has been processed
3309 movdqu ($L_p,$i1),$inout5 # borrow
3310 movdqu ($inp),$inout0
3315 movdqa $inout5,@offset[5]
3316 movups $inout0,($out)
3317 xorps $inout0,$checksum # accumulate checksum
# L_ table offsets for a six-block group: the block numbers at +1, +3 and
# +5 need a table lookup by ntz, the remaining three use L_0 loaded above
3323 lea 1($block_num),$i1 # even-numbered blocks
3324 lea 3($block_num),$i3
3325 lea 5($block_num),$i5
3326 lea 6($block_num),$block_num
3327 bsf $i1,$i1 # ntz(block)
3330 shl \$4,$i1 # ntz(block) -> table offset
3336 jmp .Locb_dec_grandloop
# main loop: six blocks per iteration, checksum taken from the outputs
3339 .Locb_dec_grandloop:
3340 movdqu `16*0`($inp),$inout0 # load input
3341 movdqu `16*1`($inp),$inout1
3342 movdqu `16*2`($inp),$inout2
3343 movdqu `16*3`($inp),$inout3
3344 movdqu `16*4`($inp),$inout4
3345 movdqu `16*5`($inp),$inout5
3346 lea `16*6`($inp),$inp
3350 movups $inout0,`16*0`($out) # store output
3351 pxor $inout0,$checksum # accumulate checksum
3352 movups $inout1,`16*1`($out)
3353 pxor $inout1,$checksum
3354 movups $inout2,`16*2`($out)
3355 pxor $inout2,$checksum
3356 movups $inout3,`16*3`($out)
3357 pxor $inout3,$checksum
3358 movups $inout4,`16*4`($out)
3359 pxor $inout4,$checksum
3360 movups $inout5,`16*5`($out)
3361 pxor $inout5,$checksum
3362 lea `16*6`($out),$out
3364 jnc .Locb_dec_grandloop
# tail: up to five remaining blocks are loaded; data registers that carry
# no block are zeroed before the multi-block helpers are used
3370 movdqu `16*0`($inp),$inout0
3373 movdqu `16*1`($inp),$inout1
3376 movdqu `16*2`($inp),$inout2
3379 movdqu `16*3`($inp),$inout3
3382 movdqu `16*4`($inp),$inout4
3383 pxor $inout5,$inout5
# five-block tail: the fifth offset becomes the running offset
3387 movdqa @offset[4],@offset[5]
3388 movups $inout0,`16*0`($out) # store output
3389 pxor $inout0,$checksum # accumulate checksum
3390 movups $inout1,`16*1`($out)
3391 pxor $inout1,$checksum
3392 movups $inout2,`16*2`($out)
3393 pxor $inout2,$checksum
3394 movups $inout3,`16*3`($out)
3395 pxor $inout3,$checksum
3396 movups $inout4,`16*4`($out)
3397 pxor $inout4,$checksum
# one-block tail: the first offset register (loaded with L_0) is handed
# over in a borrowed data register, which then becomes the running offset
3403 movdqa @offset[0],$inout5 # borrow
3407 movdqa $inout5,@offset[5]
3408 movups $inout0,`16*0`($out) # store output
3409 xorps $inout0,$checksum # accumulate checksum
# two-block tail: two unused data registers are zeroed, the second offset
# becomes the running offset
3414 pxor $inout2,$inout2
3415 pxor $inout3,$inout3
3419 movdqa @offset[1],@offset[5]
3420 movups $inout0,`16*0`($out) # store output
3421 xorps $inout0,$checksum # accumulate checksum
3422 movups $inout1,`16*1`($out)
3423 xorps $inout1,$checksum
# three-block tail: the third offset becomes the running offset
3429 pxor $inout3,$inout3
3433 movdqa @offset[2],@offset[5]
3434 movups $inout0,`16*0`($out) # store output
3435 xorps $inout0,$checksum # accumulate checksum
3436 movups $inout1,`16*1`($out)
3437 xorps $inout1,$checksum
3438 movups $inout2,`16*2`($out)
3439 xorps $inout2,$checksum
# four-block tail: the fourth offset becomes the running offset
3447 movdqa @offset[3],@offset[5]
3448 movups $inout0,`16*0`($out) # store output
3449 pxor $inout0,$checksum # accumulate checksum
3450 movups $inout1,`16*1`($out)
3451 pxor $inout1,$checksum
3452 movups $inout2,`16*2`($out)
3453 pxor $inout2,$checksum
3454 movups $inout3,`16*3`($out)
3455 pxor $inout3,$checksum
# write the updated state back to the caller's buffers
3458 pxor $rndkey0,@offset[5] # "remove" round[last]
3459 movdqu $checksum,($checksum_p) # store checksum
3460 movdqu @offset[5],($offset_p) # store last offset_i
3462 xorps %xmm0,%xmm0 # clear register bank
3469 $code.=<<___ if (!$win64);
3483 $code.=<<___ if ($win64);
# reload xmm6-xmm15 from the slots filled on entry and overwrite each slot
# with the zeroed xmm0
3484 movaps 0x00(%rsp),%xmm6
3485 movaps %xmm0,0x00(%rsp) # clear stack
3486 movaps 0x10(%rsp),%xmm7
3487 movaps %xmm0,0x10(%rsp)
3488 movaps 0x20(%rsp),%xmm8
3489 movaps %xmm0,0x20(%rsp)
3490 movaps 0x30(%rsp),%xmm9
3491 movaps %xmm0,0x30(%rsp)
3492 movaps 0x40(%rsp),%xmm10
3493 movaps %xmm0,0x40(%rsp)
3494 movaps 0x50(%rsp),%xmm11
3495 movaps %xmm0,0x50(%rsp)
3496 movaps 0x60(%rsp),%xmm12
3497 movaps %xmm0,0x60(%rsp)
3498 movaps 0x70(%rsp),%xmm13
3499 movaps %xmm0,0x70(%rsp)
3500 movaps 0x80(%rsp),%xmm14
3501 movaps %xmm0,0x80(%rsp)
3502 movaps 0x90(%rsp),%xmm15
3503 movaps %xmm0,0x90(%rsp)
3504 lea 0xa0+0x28(%rsp),%rax
3519 .cfi_def_cfa_register %rsp
3523 .size aesni_ocb_decrypt,.-aesni_ocb_decrypt
# __ocb_decrypt6: decrypt the six blocks held in the six data registers.
# Six consecutive offsets are derived by chained xors of L_ table entries,
# starting from the running offset, and applied to the inputs. The
# output-side offset is applied through the aesdeclast operand
# (offset_i ^ round[last]). The checksum is not touched here. Table
# offsets for the next six-block group are computed in between the rounds.
3525 .type __ocb_decrypt6,\@abi-omnipotent
3529 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3530 movdqu ($L_p,$i1),@offset[1]
3531 movdqa @offset[0],@offset[2]
3532 movdqu ($L_p,$i3),@offset[3]
3533 movdqa @offset[0],@offset[4]
3534 pxor @offset[5],@offset[0]
3535 movdqu ($L_p,$i5),@offset[5]
3536 pxor @offset[0],@offset[1]
3537 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3538 pxor @offset[1],@offset[2]
3539 pxor @offset[1],$inout1
3540 pxor @offset[2],@offset[3]
3541 pxor @offset[2],$inout2
3542 pxor @offset[3],@offset[4]
3543 pxor @offset[3],$inout3
3544 pxor @offset[4],@offset[5]
3545 pxor @offset[4],$inout4
3546 pxor @offset[5],$inout5
3547 $movkey 32($key_),$rndkey0
# block-number arithmetic for the next group, interleaved with the rounds
3549 lea 1($block_num),$i1 # even-numbered blocks
3550 lea 3($block_num),$i3
3551 lea 5($block_num),$i5
3553 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3554 bsf $i1,$i1 # ntz(block)
3558 aesdec $rndkey1,$inout0
3559 aesdec $rndkey1,$inout1
3560 aesdec $rndkey1,$inout2
3561 aesdec $rndkey1,$inout3
# the remaining offsets are converted to offset_i ^ round[last] as well
3562 pxor $rndkey0l,@offset[1]
3563 pxor $rndkey0l,@offset[2]
3564 aesdec $rndkey1,$inout4
3565 pxor $rndkey0l,@offset[3]
3566 pxor $rndkey0l,@offset[4]
3567 aesdec $rndkey1,$inout5
3568 $movkey 48($key_),$rndkey1
3569 pxor $rndkey0l,@offset[5]
3571 aesdec $rndkey0,$inout0
3572 aesdec $rndkey0,$inout1
3573 aesdec $rndkey0,$inout2
3574 aesdec $rndkey0,$inout3
3575 aesdec $rndkey0,$inout4
3576 aesdec $rndkey0,$inout5
3577 $movkey 64($key_),$rndkey0
3578 shl \$4,$i1 # ntz(block) -> table offset
3584 aesdec $rndkey1,$inout0
3585 aesdec $rndkey1,$inout1
3586 aesdec $rndkey1,$inout2
3587 aesdec $rndkey1,$inout3
3588 aesdec $rndkey1,$inout4
3589 aesdec $rndkey1,$inout5
# NOTE(review): from here round keys are fetched relative to the key
# register with rax as index; rax is reset from r10 further down
3590 $movkey ($key,%rax),$rndkey1
3593 aesdec $rndkey0,$inout0
3594 aesdec $rndkey0,$inout1
3595 aesdec $rndkey0,$inout2
3596 aesdec $rndkey0,$inout3
3597 aesdec $rndkey0,$inout4
3598 aesdec $rndkey0,$inout5
3599 $movkey -16($key,%rax),$rndkey0
3602 aesdec $rndkey1,$inout0
3603 aesdec $rndkey1,$inout1
3604 aesdec $rndkey1,$inout2
3605 aesdec $rndkey1,$inout3
3606 aesdec $rndkey1,$inout4
3607 aesdec $rndkey1,$inout5
3608 $movkey 16($key_),$rndkey1
# final round uses the prepared offsets as the round-key operand; the
# first offset register is reloaded with L_0 right after its last use
3611 aesdeclast @offset[0],$inout0
3612 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3613 mov %r10,%rax # restore twisted rounds
3614 aesdeclast @offset[1],$inout1
3615 aesdeclast @offset[2],$inout2
3616 aesdeclast @offset[3],$inout3
3617 aesdeclast @offset[4],$inout4
3618 aesdeclast @offset[5],$inout5
3621 .size __ocb_decrypt6,.-__ocb_decrypt6
# __ocb_decrypt4: four-block variant of the six-block decrypt helper.
# Four offsets are chained from the running offset and two L_ table
# entries and applied to the inputs; aesdeclast takes
# offset_i ^ round[last] as its operand. The checksum is not touched here
# and no table offsets for a following group are computed.
3623 .type __ocb_decrypt4,\@abi-omnipotent
3627 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3628 movdqu ($L_p,$i1),@offset[1]
3629 movdqa @offset[0],@offset[2]
3630 movdqu ($L_p,$i3),@offset[3]
3631 pxor @offset[5],@offset[0]
3632 pxor @offset[0],@offset[1]
3633 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3634 pxor @offset[1],@offset[2]
3635 pxor @offset[1],$inout1
3636 pxor @offset[2],@offset[3]
3637 pxor @offset[2],$inout2
3638 pxor @offset[3],$inout3
3639 $movkey 32($key_),$rndkey0
# convert all four offsets for use as the final-round operand
3641 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3642 pxor $rndkey0l,@offset[1]
3643 pxor $rndkey0l,@offset[2]
3644 pxor $rndkey0l,@offset[3]
3646 aesdec $rndkey1,$inout0
3647 aesdec $rndkey1,$inout1
3648 aesdec $rndkey1,$inout2
3649 aesdec $rndkey1,$inout3
3650 $movkey 48($key_),$rndkey1
3652 aesdec $rndkey0,$inout0
3653 aesdec $rndkey0,$inout1
3654 aesdec $rndkey0,$inout2
3655 aesdec $rndkey0,$inout3
3656 $movkey 64($key_),$rndkey0
3661 aesdec $rndkey1,$inout0
3662 aesdec $rndkey1,$inout1
3663 aesdec $rndkey1,$inout2
3664 aesdec $rndkey1,$inout3
3665 $movkey ($key,%rax),$rndkey1
3668 aesdec $rndkey0,$inout0
3669 aesdec $rndkey0,$inout1
3670 aesdec $rndkey0,$inout2
3671 aesdec $rndkey0,$inout3
3672 $movkey -16($key,%rax),$rndkey0
3675 aesdec $rndkey1,$inout0
3676 aesdec $rndkey1,$inout1
3677 aesdec $rndkey1,$inout2
3678 aesdec $rndkey1,$inout3
3679 $movkey 16($key_),$rndkey1
3680 mov %r10,%rax # restore twisted rounds
# final round with the prepared offsets as operand
3682 aesdeclast @offset[0],$inout0
3683 aesdeclast @offset[1],$inout1
3684 aesdeclast @offset[2],$inout2
3685 aesdeclast @offset[3],$inout3
3688 .size __ocb_decrypt4,.-__ocb_decrypt4
# __ocb_decrypt1: decrypt the single block in the first data register.
# The sixth data register is used as the offset register: the public
# function loads an L_ table entry into it (marked borrow there), it is
# combined with the running offset here, and it is left holding the value
# used as the aesdeclast operand so the caller can copy it back into the
# running-offset register. The checksum is not touched here.
3690 .type __ocb_decrypt1,\@abi-omnipotent
3694 pxor @offset[5],$inout5 # offset_i
3695 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3696 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3697 $movkey 32($key_),$rndkey0
3699 aesdec $rndkey1,$inout0
3700 $movkey 48($key_),$rndkey1
3701 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3703 aesdec $rndkey0,$inout0
3704 $movkey 64($key_),$rndkey0
3709 aesdec $rndkey1,$inout0
3710 $movkey ($key,%rax),$rndkey1
3713 aesdec $rndkey0,$inout0
3714 $movkey -16($key,%rax),$rndkey0
3717 aesdec $rndkey1,$inout0
3718 $movkey 16($key_),$rndkey1 # redundant in tail
3719 mov %r10,%rax # restore twisted rounds
# last round doubles as the output-side offset xor
3721 aesdeclast $inout5,$inout0
3724 .size __ocb_decrypt1,.-__ocb_decrypt1
3728 ########################################################################
3729 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
3730 # size_t length, const AES_KEY *key,
3731 # unsigned char *ivp,const int enc);
# Stack frame for the bulk CBC decrypt path: 0x10 bytes at (%rsp), which
# the partial-tail code uses as a scratch block, plus 0xa0 bytes on Win64
# where %xmm6-%xmm15 are saved at 0x10..0xa0(%rsp).
3733 my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
# %xmm10-%xmm15: the IV plus five registers that the 8-block loop reloads
# from the input and passes to aesdeclast as operands
3734 my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
3737 .globl ${PREFIX}_cbc_encrypt
3738 .type ${PREFIX}_cbc_encrypt,\@function,6
3740 ${PREFIX}_cbc_encrypt:
3742 test $len,$len # check length
3745 mov 240($key),$rnds_ # key->rounds
3746 mov $key,$key_ # backup $key
3747 test %r9d,%r9d # 6th argument
3749 #--------------------------- CBC ENCRYPT ------------------------------#
3750 movups ($ivp),$inout0 # load iv as initial state
3758 movups ($inp),$inout1 # load input
3760 #xorps $inout1,$inout0
3762 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
3764 mov $rnds_,$rounds # restore $rounds
3765 mov $key_,$key # restore $key
3766 movups $inout0,0($out) # store output
3772 pxor $rndkey0,$rndkey0 # clear register bank
3773 pxor $rndkey1,$rndkey1
3774 movups $inout0,($ivp)
3775 pxor $inout0,$inout0
3776 pxor $inout1,$inout1
3780 mov $len,%rcx # zaps $key
3781 xchg $inp,$out # $inp is %rsi and $out is %rdi now
3782 .long 0x9066A4F3 # rep movsb
3783 mov \$16,%ecx # zero tail
3786 .long 0x9066AAF3 # rep stosb
3787 lea -16(%rdi),%rdi # rewind $out by 1 block
3788 mov $rnds_,$rounds # restore $rounds
3789 mov %rdi,%rsi # $inp and $out are the same
3790 mov $key_,$key # restore $key
3791 xor $len,$len # len=16
3792 jmp .Lcbc_enc_loop # one more spin
3793 \f#--------------------------- CBC DECRYPT ------------------------------#
3797 jne .Lcbc_decrypt_bulk
3799 # handle single block without allocating stack frame,
3800 # useful in ciphertext stealing mode
3801 movdqu ($inp),$inout0 # load input
3802 movdqu ($ivp),$inout1 # load iv
3803 movdqa $inout0,$inout2 # future iv
3805 &aesni_generate1("dec",$key,$rnds_);
3807 pxor $rndkey0,$rndkey0 # clear register bank
3808 pxor $rndkey1,$rndkey1
3809 movdqu $inout2,($ivp) # store iv
3810 xorps $inout1,$inout0 # ^=iv
3811 pxor $inout1,$inout1
3812 movups $inout0,($out) # store output
3813 pxor $inout0,$inout0
3817 lea (%rsp),%r11 # frame pointer
3818 .cfi_def_cfa_register %r11
3821 sub \$$frame_size,%rsp
3822 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
3824 $code.=<<___ if ($win64);
3825 movaps %xmm6,0x10(%rsp)
3826 movaps %xmm7,0x20(%rsp)
3827 movaps %xmm8,0x30(%rsp)
3828 movaps %xmm9,0x40(%rsp)
3829 movaps %xmm10,0x50(%rsp)
3830 movaps %xmm11,0x60(%rsp)
3831 movaps %xmm12,0x70(%rsp)
3832 movaps %xmm13,0x80(%rsp)
3833 movaps %xmm14,0x90(%rsp)
3834 movaps %xmm15,0xa0(%rsp)
3838 my $inp_=$key_="%rbp"; # reassign $key_
3841 mov $key,$key_ # [re-]backup $key [after reassignment]
3847 $movkey ($key),$rndkey0
3848 movdqu 0x00($inp),$inout0 # load input
3849 movdqu 0x10($inp),$inout1
3851 movdqu 0x20($inp),$inout2
3853 movdqu 0x30($inp),$inout3
3855 movdqu 0x40($inp),$inout4
3857 movdqu 0x50($inp),$inout5
3859 mov OPENSSL_ia32cap_P+4(%rip),%r9d
3861 jbe .Lcbc_dec_six_or_seven
3863 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
3864 sub \$0x50,$len # $len is biased by -5*16
3865 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
3866 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont]
3867 sub \$0x20,$len # $len is biased by -7*16
3868 lea 0x70($key),$key # size optimization
3869 jmp .Lcbc_dec_loop8_enter
3872 movups $inout7,($out)
3874 .Lcbc_dec_loop8_enter:
3875 movdqu 0x60($inp),$inout6
3876 pxor $rndkey0,$inout0
3877 movdqu 0x70($inp),$inout7
3878 pxor $rndkey0,$inout1
3879 $movkey 0x10-0x70($key),$rndkey1
3880 pxor $rndkey0,$inout2
3882 cmp \$0x70,$len # is there at least 0x60 bytes ahead?
3883 pxor $rndkey0,$inout3
3884 pxor $rndkey0,$inout4
3885 pxor $rndkey0,$inout5
3886 pxor $rndkey0,$inout6
3888 aesdec $rndkey1,$inout0
3889 pxor $rndkey0,$inout7
3890 $movkey 0x20-0x70($key),$rndkey0
3891 aesdec $rndkey1,$inout1
3892 aesdec $rndkey1,$inout2
3893 aesdec $rndkey1,$inout3
3894 aesdec $rndkey1,$inout4
3895 aesdec $rndkey1,$inout5
3896 aesdec $rndkey1,$inout6
3899 aesdec $rndkey1,$inout7
3901 $movkey 0x30-0x70($key),$rndkey1
3903 for($i=1;$i<12;$i++) {
3904 my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
3905 $code.=<<___ if ($i==7);
3909 aesdec $rndkeyx,$inout0
3910 aesdec $rndkeyx,$inout1
3911 aesdec $rndkeyx,$inout2
3912 aesdec $rndkeyx,$inout3
3913 aesdec $rndkeyx,$inout4
3914 aesdec $rndkeyx,$inout5
3915 aesdec $rndkeyx,$inout6
3916 aesdec $rndkeyx,$inout7
3917 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
3919 $code.=<<___ if ($i<6 || (!($i&1) && $i>7));
3922 $code.=<<___ if ($i==7);
3925 $code.=<<___ if ($i==9);
3928 $code.=<<___ if ($i==11);
3935 aesdec $rndkey1,$inout0
3936 aesdec $rndkey1,$inout1
3939 aesdec $rndkey1,$inout2
3940 aesdec $rndkey1,$inout3
3943 aesdec $rndkey1,$inout4
3944 aesdec $rndkey1,$inout5
3947 aesdec $rndkey1,$inout6
3948 aesdec $rndkey1,$inout7
3949 movdqu 0x50($inp),$rndkey1
3951 aesdeclast $iv,$inout0
3952 movdqu 0x60($inp),$iv # borrow $iv
3953 pxor $rndkey0,$rndkey1
3954 aesdeclast $in0,$inout1
3956 movdqu 0x70($inp),$rndkey0 # next IV
3957 aesdeclast $in1,$inout2
3959 movdqu 0x00($inp_),$in0
3960 aesdeclast $in2,$inout3
3961 aesdeclast $in3,$inout4
3962 movdqu 0x10($inp_),$in1
3963 movdqu 0x20($inp_),$in2
3964 aesdeclast $in4,$inout5
3965 aesdeclast $rndkey1,$inout6
3966 movdqu 0x30($inp_),$in3
3967 movdqu 0x40($inp_),$in4
3968 aesdeclast $iv,$inout7
3969 movdqa $rndkey0,$iv # return $iv
3970 movdqu 0x50($inp_),$rndkey1
3971 $movkey -0x70($key),$rndkey0
3973 movups $inout0,($out) # store output
3975 movups $inout1,0x10($out)
3977 movups $inout2,0x20($out)
3979 movups $inout3,0x30($out)
3981 movups $inout4,0x40($out)
3983 movups $inout5,0x50($out)
3984 movdqa $rndkey1,$inout5
3985 movups $inout6,0x60($out)
3991 movaps $inout7,$inout0
3992 lea -0x70($key),$key
3994 jle .Lcbc_dec_clear_tail_collected
3995 movups $inout7,($out)
4001 .Lcbc_dec_six_or_seven:
4005 movaps $inout5,$inout6
4006 call _aesni_decrypt6
4007 pxor $iv,$inout0 # ^= IV
4010 movdqu $inout0,($out)
4012 movdqu $inout1,0x10($out)
4013 pxor $inout1,$inout1 # clear register bank
4015 movdqu $inout2,0x20($out)
4016 pxor $inout2,$inout2
4018 movdqu $inout3,0x30($out)
4019 pxor $inout3,$inout3
4021 movdqu $inout4,0x40($out)
4022 pxor $inout4,$inout4
4024 movdqa $inout5,$inout0
4025 pxor $inout5,$inout5
4026 jmp .Lcbc_dec_tail_collected
4030 movups 0x60($inp),$inout6
4031 xorps $inout7,$inout7
4032 call _aesni_decrypt8
4033 movups 0x50($inp),$inout7
4034 pxor $iv,$inout0 # ^= IV
4035 movups 0x60($inp),$iv
4037 movdqu $inout0,($out)
4039 movdqu $inout1,0x10($out)
4040 pxor $inout1,$inout1 # clear register bank
4042 movdqu $inout2,0x20($out)
4043 pxor $inout2,$inout2
4045 movdqu $inout3,0x30($out)
4046 pxor $inout3,$inout3
4048 movdqu $inout4,0x40($out)
4049 pxor $inout4,$inout4
4050 pxor $inout7,$inout6
4051 movdqu $inout5,0x50($out)
4052 pxor $inout5,$inout5
4054 movdqa $inout6,$inout0
4055 pxor $inout6,$inout6
4056 pxor $inout7,$inout7
4057 jmp .Lcbc_dec_tail_collected
4061 movups $inout5,($out)
4063 movdqu 0x00($inp),$inout0 # load input
4064 movdqu 0x10($inp),$inout1
4066 movdqu 0x20($inp),$inout2
4068 movdqu 0x30($inp),$inout3
4070 movdqu 0x40($inp),$inout4
4072 movdqu 0x50($inp),$inout5
4074 .Lcbc_dec_loop6_enter:
4076 movdqa $inout5,$inout6
4078 call _aesni_decrypt6
4080 pxor $iv,$inout0 # ^= IV
4083 movdqu $inout0,($out)
4085 movdqu $inout1,0x10($out)
4087 movdqu $inout2,0x20($out)
4090 movdqu $inout3,0x30($out)
4093 movdqu $inout4,0x40($out)
4098 movdqa $inout5,$inout0
4100 jle .Lcbc_dec_clear_tail_collected
4101 movups $inout5,($out)
4105 movups ($inp),$inout0
4107 jbe .Lcbc_dec_one # $len is 1*16 or less
4109 movups 0x10($inp),$inout1
4112 jbe .Lcbc_dec_two # $len is 2*16 or less
4114 movups 0x20($inp),$inout2
4117 jbe .Lcbc_dec_three # $len is 3*16 or less
4119 movups 0x30($inp),$inout3
4122 jbe .Lcbc_dec_four # $len is 4*16 or less
4124 movups 0x40($inp),$inout4 # $len is 5*16 or less
4127 xorps $inout5,$inout5
4128 call _aesni_decrypt6
4132 movdqu $inout0,($out)
4134 movdqu $inout1,0x10($out)
4135 pxor $inout1,$inout1 # clear register bank
4137 movdqu $inout2,0x20($out)
4138 pxor $inout2,$inout2
4140 movdqu $inout3,0x30($out)
4141 pxor $inout3,$inout3
4143 movdqa $inout4,$inout0
4144 pxor $inout4,$inout4
4145 pxor $inout5,$inout5
4147 jmp .Lcbc_dec_tail_collected
4153 &aesni_generate1("dec",$key,$rounds);
4157 jmp .Lcbc_dec_tail_collected
4161 call _aesni_decrypt2
4165 movdqu $inout0,($out)
4166 movdqa $inout1,$inout0
4167 pxor $inout1,$inout1 # clear register bank
4169 jmp .Lcbc_dec_tail_collected
4173 call _aesni_decrypt3
4177 movdqu $inout0,($out)
4179 movdqu $inout1,0x10($out)
4180 pxor $inout1,$inout1 # clear register bank
4181 movdqa $inout2,$inout0
4182 pxor $inout2,$inout2
4184 jmp .Lcbc_dec_tail_collected
4188 call _aesni_decrypt4
4192 movdqu $inout0,($out)
4194 movdqu $inout1,0x10($out)
4195 pxor $inout1,$inout1 # clear register bank
4197 movdqu $inout2,0x20($out)
4198 pxor $inout2,$inout2
4199 movdqa $inout3,$inout0
4200 pxor $inout3,$inout3
4202 jmp .Lcbc_dec_tail_collected
4205 .Lcbc_dec_clear_tail_collected:
4206 pxor $inout1,$inout1 # clear register bank
4207 pxor $inout2,$inout2
4208 pxor $inout3,$inout3
4210 $code.=<<___ if (!$win64);
4211 pxor $inout4,$inout4 # %xmm6..9
4212 pxor $inout5,$inout5
4213 pxor $inout6,$inout6
4214 pxor $inout7,$inout7
4217 .Lcbc_dec_tail_collected:
4220 jnz .Lcbc_dec_tail_partial
4221 movups $inout0,($out)
4222 pxor $inout0,$inout0
4225 .Lcbc_dec_tail_partial:
4226 movaps $inout0,(%rsp)
4227 pxor $inout0,$inout0
4232 .long 0x9066A4F3 # rep movsb
4233 movdqa $inout0,(%rsp)
4236 xorps $rndkey0,$rndkey0 # %xmm0
4237 pxor $rndkey1,$rndkey1
4239 $code.=<<___ if ($win64);
4240 movaps 0x10(%rsp),%xmm6
4241 movaps %xmm0,0x10(%rsp) # clear stack
4242 movaps 0x20(%rsp),%xmm7
4243 movaps %xmm0,0x20(%rsp)
4244 movaps 0x30(%rsp),%xmm8
4245 movaps %xmm0,0x30(%rsp)
4246 movaps 0x40(%rsp),%xmm9
4247 movaps %xmm0,0x40(%rsp)
4248 movaps 0x50(%rsp),%xmm10
4249 movaps %xmm0,0x50(%rsp)
4250 movaps 0x60(%rsp),%xmm11
4251 movaps %xmm0,0x60(%rsp)
4252 movaps 0x70(%rsp),%xmm12
4253 movaps %xmm0,0x70(%rsp)
4254 movaps 0x80(%rsp),%xmm13
4255 movaps %xmm0,0x80(%rsp)
4256 movaps 0x90(%rsp),%xmm14
4257 movaps %xmm0,0x90(%rsp)
4258 movaps 0xa0(%rsp),%xmm15
4259 movaps %xmm0,0xa0(%rsp)
4265 .cfi_def_cfa_register %rsp
4269 .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
4272 # int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
4273 # int bits, AES_KEY *key)
4275 # input: $inp user-supplied key
4276 # $bits $inp length in bits
4277 # $key pointer to key schedule
4278 # output: %eax 0 denoting success, -1 or -2 - failure (see C)
4279 # *$key key schedule
4281 { my ($inp,$bits,$key) = @_4args;
4285 .globl ${PREFIX}_set_decrypt_key
4286 .type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
4288 ${PREFIX}_set_decrypt_key:
4290 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
4291 .cfi_adjust_cfa_offset 8
4292 call __aesni_set_encrypt_key # build the encryption key schedule first
4293 shl \$4,$bits # rounds-1 after __aesni_set_encrypt_key, scaled to a byte offset (16 per round key)
4296 lea 16($key,$bits),$inp # points at the end of key schedule
4298 $movkey ($key),%xmm0 # just swap
4299 $movkey ($inp),%xmm1
4300 $movkey %xmm0,($inp)
4301 $movkey %xmm1,($key)
4306 $movkey ($key),%xmm0 # swap and inverse
4307 $movkey ($inp),%xmm1
4312 $movkey %xmm0,16($inp)
4313 $movkey %xmm1,-16($key)
4315 ja .Ldec_key_inverse
4317 $movkey ($key),%xmm0 # inverse middle
4320 $movkey %xmm0,($inp)
4324 .cfi_adjust_cfa_offset -8 # mirrors the +8 adjustment recorded on entry
4327 .LSEH_end_set_decrypt_key:
4328 .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
4331 # This is based on submission from Intel by
4336 # Aggressively optimized with respect to aeskeygenassist's critical path
4337 # and is contained in %xmm0-5 to meet the Win64 ABI requirement.
4339 # int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
4340 # int bits, AES_KEY * const key);
4342 # input: $inp user-supplied key
4343 # $bits $inp length in bits
4344 # $key pointer to key schedule
4345 # output: %eax 0 denoting success, -1 or -2 - failure (see C)
4346 # $bits rounds-1 (used in aesni_set_decrypt_key)
4347 # *$key key schedule
4348 # $key pointer to key schedule (used in
4349 # aesni_set_decrypt_key)
4351 # The subroutine is frame-less, which means that only volatile registers
4352 # are used. Note that it's declared "abi-omnipotent", which means that
4353 # the number of volatile registers is smaller on Windows.
4356 .globl ${PREFIX}_set_encrypt_key
4357 .type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
4359 ${PREFIX}_set_encrypt_key:
4360 __aesni_set_encrypt_key: # internal entry point, also called from the set_decrypt_key routine
4362 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
4363 .cfi_adjust_cfa_offset 8
4370 mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits
4371 movups ($inp),%xmm0 # pull first 128 bits of *userKey
4372 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
4373 and OPENSSL_ia32cap_P+4(%rip),%r10d # keep only the AVX and XOP capability bits
4374 lea 16($key),%rax # %rax is used as modifiable copy of $key
4383 mov \$9,$bits # 10 rounds for 128-bit key
4384 cmp \$`1<<28`,%r10d # AVX, but no XOP
4387 $movkey %xmm0,($key) # round 0
4388 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
4389 call .Lkey_expansion_128_cold # cold entry skips the store: round 0 is already written
4390 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
4391 call .Lkey_expansion_128
4392 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
4393 call .Lkey_expansion_128
4394 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
4395 call .Lkey_expansion_128
4396 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
4397 call .Lkey_expansion_128
4398 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
4399 call .Lkey_expansion_128
4400 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
4401 call .Lkey_expansion_128
4402 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
4403 call .Lkey_expansion_128
4404 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
4405 call .Lkey_expansion_128
4406 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
4407 call .Lkey_expansion_128
4408 $movkey %xmm0,(%rax) # store the final round key
4409 mov $bits,80(%rax) # 240($key)
4415 movdqa .Lkey_rotate(%rip),%xmm5
4417 movdqa .Lkey_rcon1(%rip),%xmm4
4425 aesenclast %xmm4,%xmm0
4438 movdqu %xmm0,-16(%rax)
4444 movdqa .Lkey_rcon1b(%rip),%xmm4
4447 aesenclast %xmm4,%xmm0
4463 aesenclast %xmm4,%xmm0
4474 movdqu %xmm0,16(%rax)
4476 mov $bits,96(%rax) # 240($key)
4482 movq 16($inp),%xmm2 # remaining 1/3 of *userKey
4483 mov \$11,$bits # 12 rounds for 192-bit key
4484 cmp \$`1<<28`,%r10d # AVX, but no XOP
4487 $movkey %xmm0,($key) # round 0
4488 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
4489 call .Lkey_expansion_192a_cold
4490 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
4491 call .Lkey_expansion_192b
4492 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
4493 call .Lkey_expansion_192a
4494 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
4495 call .Lkey_expansion_192b
4496 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
4497 call .Lkey_expansion_192a
4498 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
4499 call .Lkey_expansion_192b
4500 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
4501 call .Lkey_expansion_192a
4502 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
4503 call .Lkey_expansion_192b
4504 $movkey %xmm0,(%rax)
4505 mov $bits,48(%rax) # 240($key)
4511 movdqa .Lkey_rotate192(%rip),%xmm5
4512 movdqa .Lkey_rcon1(%rip),%xmm4
4522 aesenclast %xmm4,%xmm2
4534 pshufd \$0xff,%xmm0,%xmm3
4541 movdqu %xmm0,-16(%rax)
4546 mov $bits,32(%rax) # 240($key)
4552 movups 16($inp),%xmm2 # remaining half of *userKey
4553 mov \$13,$bits # 14 rounds for 256-bit key
4555 cmp \$`1<<28`,%r10d # AVX, but no XOP
4558 $movkey %xmm0,($key) # round 0
4559 $movkey %xmm2,16($key) # round 1
4560 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
4561 call .Lkey_expansion_256a_cold
4562 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
4563 call .Lkey_expansion_256b
4564 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
4565 call .Lkey_expansion_256a
4566 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
4567 call .Lkey_expansion_256b
4568 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
4569 call .Lkey_expansion_256a
4570 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
4571 call .Lkey_expansion_256b
4572 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
4573 call .Lkey_expansion_256a
4574 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
4575 call .Lkey_expansion_256b
4576 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
4577 call .Lkey_expansion_256a
4578 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
4579 call .Lkey_expansion_256b
4580 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
4581 call .Lkey_expansion_256a
4582 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
4583 call .Lkey_expansion_256b
4584 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
4585 call .Lkey_expansion_256a
4586 $movkey %xmm0,(%rax)
4587 mov $bits,16(%rax) # 240($key)
4593 movdqa .Lkey_rotate(%rip),%xmm5
4594 movdqa .Lkey_rcon1(%rip),%xmm4
4596 movdqu %xmm0,0($key)
4598 movdqu %xmm2,16($key)
4604 aesenclast %xmm4,%xmm2
4621 pshufd \$0xff,%xmm0,%xmm2
4623 aesenclast %xmm3,%xmm2
4634 movdqu %xmm2,16(%rax)
4641 mov $bits,16(%rax) # 240($key)
4656 .cfi_adjust_cfa_offset -8
4658 .LSEH_end_set_encrypt_key:
4661 .Lkey_expansion_128:
4662 $movkey %xmm0,(%rax)
4664 .Lkey_expansion_128_cold:
4665 shufps \$0b00010000,%xmm0,%xmm4
4667 shufps \$0b10001100,%xmm0,%xmm4
4669 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4674 .Lkey_expansion_192a:
4675 $movkey %xmm0,(%rax)
4677 .Lkey_expansion_192a_cold:
4679 .Lkey_expansion_192b_warm:
4680 shufps \$0b00010000,%xmm0,%xmm4
4683 shufps \$0b10001100,%xmm0,%xmm4
4686 pshufd \$0b01010101,%xmm1,%xmm1 # critical path
4689 pshufd \$0b11111111,%xmm0,%xmm3
4694 .Lkey_expansion_192b:
4696 shufps \$0b01000100,%xmm0,%xmm5
4697 $movkey %xmm5,(%rax)
4698 shufps \$0b01001110,%xmm2,%xmm3
4699 $movkey %xmm3,16(%rax)
4701 jmp .Lkey_expansion_192b_warm
4704 .Lkey_expansion_256a:
4705 $movkey %xmm2,(%rax)
4707 .Lkey_expansion_256a_cold:
4708 shufps \$0b00010000,%xmm0,%xmm4
4710 shufps \$0b10001100,%xmm0,%xmm4
4712 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4717 .Lkey_expansion_256b:
4718 $movkey %xmm0,(%rax)
4721 shufps \$0b00010000,%xmm2,%xmm4
4723 shufps \$0b10001100,%xmm2,%xmm4
4725 shufps \$0b10101010,%xmm1,%xmm1 # critical path
4729 .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
4730 .size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
4737 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
4745 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4747 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
4749 .long 0x04070605,0x04070605,0x04070605,0x04070605
4753 .long 0x1b,0x1b,0x1b,0x1b
4755 .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
4759 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4760 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
4768 .extern __imp_RtlVirtualUnwind
4770 $code.=<<___ if ($PREFIX eq "aesni");
4771 .type ecb_ccm64_se_handler,\@abi-omnipotent
4773 ecb_ccm64_se_handler:
4785 mov 120($context),%rax # pull context->Rax
4786 mov 248($context),%rbx # pull context->Rip
4788 mov 8($disp),%rsi # disp->ImageBase
4789 mov 56($disp),%r11 # disp->HandlerData
4791 mov 0(%r11),%r10d # HandlerData[0]
4792 lea (%rsi,%r10),%r10 # prologue label
4793 cmp %r10,%rbx # context->Rip<prologue label
4794 jb .Lcommon_seh_tail
4796 mov 152($context),%rax # pull context->Rsp
4798 mov 4(%r11),%r10d # HandlerData[1]
4799 lea (%rsi,%r10),%r10 # epilogue label
4800 cmp %r10,%rbx # context->Rip>=epilogue label
4801 jae .Lcommon_seh_tail
4803 lea 0(%rax),%rsi # %xmm save area
4804 lea 512($context),%rdi # &context.Xmm6
4805 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
4806 .long 0xa548f3fc # cld; rep movsq
4807 lea 0x58(%rax),%rax # adjust stack pointer
4809 jmp .Lcommon_seh_tail
4810 .size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
4812 .type ctr_xts_se_handler,\@abi-omnipotent
4826 mov 120($context),%rax # pull context->Rax
4827 mov 248($context),%rbx # pull context->Rip
4829 mov 8($disp),%rsi # disp->ImageBase
4830 mov 56($disp),%r11 # disp->HandlerData
4832 mov 0(%r11),%r10d # HandlerData[0]
4833 lea (%rsi,%r10),%r10 # prologue label
4834 cmp %r10,%rbx # context->Rip<prologue label
4835 jb .Lcommon_seh_tail
4837 mov 152($context),%rax # pull context->Rsp
4839 mov 4(%r11),%r10d # HandlerData[1]
4840 lea (%rsi,%r10),%r10 # epilogue label
4841 cmp %r10,%rbx # context->Rip>=epilogue label
4842 jae .Lcommon_seh_tail
4844 mov 208($context),%rax # pull context->R11
4846 lea -0xa8(%rax),%rsi # %xmm save area
4847 lea 512($context),%rdi # & context.Xmm6
4848 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4849 .long 0xa548f3fc # cld; rep movsq
4851 mov -8(%rax),%rbp # restore saved %rbp
4852 mov %rbp,160($context) # restore context->Rbp
4853 jmp .Lcommon_seh_tail
4854 .size ctr_xts_se_handler,.-ctr_xts_se_handler
4856 .type ocb_se_handler,\@abi-omnipotent
4870 mov 120($context),%rax # pull context->Rax
4871 mov 248($context),%rbx # pull context->Rip
4873 mov 8($disp),%rsi # disp->ImageBase
4874 mov 56($disp),%r11 # disp->HandlerData
4876 mov 0(%r11),%r10d # HandlerData[0]
4877 lea (%rsi,%r10),%r10 # prologue label
4878 cmp %r10,%rbx # context->Rip<prologue label
4879 jb .Lcommon_seh_tail
4881 mov 4(%r11),%r10d # HandlerData[1]
4882 lea (%rsi,%r10),%r10 # epilogue label
4883 cmp %r10,%rbx # context->Rip>=epilogue label
4884 jae .Lcommon_seh_tail
4886 mov 8(%r11),%r10d # HandlerData[2]
4887 lea (%rsi,%r10),%r10
4888 cmp %r10,%rbx # context->Rip>=pop label
4891 mov 152($context),%rax # pull context->Rsp
4893 lea (%rax),%rsi # %xmm save area
4894 lea 512($context),%rdi # & context.Xmm6
4895 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4896 .long 0xa548f3fc # cld; rep movsq
4897 lea 0xa0+0x28(%rax),%rax
4906 mov %rbx,144($context) # restore context->Rbx
4907 mov %rbp,160($context) # restore context->Rbp
4908 mov %r12,216($context) # restore context->R12
4909 mov %r13,224($context) # restore context->R13
4910 mov %r14,232($context) # restore context->R14
4912 jmp .Lcommon_seh_tail
4913 .size ocb_se_handler,.-ocb_se_handler
4916 .type cbc_se_handler,\@abi-omnipotent
4930 mov 152($context),%rax # pull context->Rsp
4931 mov 248($context),%rbx # pull context->Rip
4933 lea .Lcbc_decrypt_bulk(%rip),%r10
4934 cmp %r10,%rbx # context->Rip<"prologue" label
4935 jb .Lcommon_seh_tail
4937 mov 120($context),%rax # pull context->Rax
4939 lea .Lcbc_decrypt_body(%rip),%r10
4940 cmp %r10,%rbx # context->Rip<cbc_decrypt_body
4941 jb .Lcommon_seh_tail
4943 mov 152($context),%rax # pull context->Rsp
4945 lea .Lcbc_ret(%rip),%r10
4946 cmp %r10,%rbx # context->Rip>="epilogue" label
4947 jae .Lcommon_seh_tail
4949 lea 16(%rax),%rsi # %xmm save area
4950 lea 512($context),%rdi # &context.Xmm6
4951 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4952 .long 0xa548f3fc # cld; rep movsq
4954 mov 208($context),%rax # pull context->R11
4956 mov -8(%rax),%rbp # restore saved %rbp
4957 mov %rbp,160($context) # restore context->Rbp
4962 mov %rax,152($context) # restore context->Rsp
4963 mov %rsi,168($context) # restore context->Rsi
4964 mov %rdi,176($context) # restore context->Rdi
4966 mov 40($disp),%rdi # disp->ContextRecord
4967 mov $context,%rsi # context
4968 mov \$154,%ecx # sizeof(CONTEXT)/sizeof(%rax)
4969 .long 0xa548f3fc # cld; rep movsq
4972 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
4973 mov 8(%rsi),%rdx # arg2, disp->ImageBase
4974 mov 0(%rsi),%r8 # arg3, disp->ControlPc
4975 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
4976 mov 40(%rsi),%r10 # disp->ContextRecord
4977 lea 56(%rsi),%r11 # &disp->HandlerData
4978 lea 24(%rsi),%r12 # &disp->EstablisherFrame
4979 mov %r10,32(%rsp) # arg5
4980 mov %r11,40(%rsp) # arg6
4981 mov %r12,48(%rsp) # arg7
4982 mov %rcx,56(%rsp) # arg8, (NULL)
4983 call *__imp_RtlVirtualUnwind(%rip) # arg5..arg8 were stored on the stack above
4985 mov \$1,%eax # ExceptionContinueSearch
4997 .size cbc_se_handler,.-cbc_se_handler
5002 $code.=<<___ if ($PREFIX eq "aesni");
5003 .rva .LSEH_begin_aesni_ecb_encrypt
5004 .rva .LSEH_end_aesni_ecb_encrypt
5007 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
5008 .rva .LSEH_end_aesni_ccm64_encrypt_blocks
5009 .rva .LSEH_info_ccm64_enc
5011 .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
5012 .rva .LSEH_end_aesni_ccm64_decrypt_blocks
5013 .rva .LSEH_info_ccm64_dec
5015 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
5016 .rva .LSEH_end_aesni_ctr32_encrypt_blocks
5017 .rva .LSEH_info_ctr32
5019 .rva .LSEH_begin_aesni_xts_encrypt
5020 .rva .LSEH_end_aesni_xts_encrypt
5021 .rva .LSEH_info_xts_enc
5023 .rva .LSEH_begin_aesni_xts_decrypt
5024 .rva .LSEH_end_aesni_xts_decrypt
5025 .rva .LSEH_info_xts_dec
5027 .rva .LSEH_begin_aesni_ocb_encrypt
5028 .rva .LSEH_end_aesni_ocb_encrypt
5029 .rva .LSEH_info_ocb_enc
5031 .rva .LSEH_begin_aesni_ocb_decrypt
5032 .rva .LSEH_end_aesni_ocb_decrypt
5033 .rva .LSEH_info_ocb_dec
5036 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
5037 .rva .LSEH_end_${PREFIX}_cbc_encrypt
5040 .rva ${PREFIX}_set_decrypt_key
5041 .rva .LSEH_end_set_decrypt_key
5044 .rva ${PREFIX}_set_encrypt_key
5045 .rva .LSEH_end_set_encrypt_key
5050 $code.=<<___ if ($PREFIX eq "aesni");
5053 .rva ecb_ccm64_se_handler
5054 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[]
5055 .LSEH_info_ccm64_enc:
5057 .rva ecb_ccm64_se_handler
5058 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
5059 .LSEH_info_ccm64_dec:
5061 .rva ecb_ccm64_se_handler
5062 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
5065 .rva ctr_xts_se_handler
5066 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
5069 .rva ctr_xts_se_handler
5070 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
5073 .rva ctr_xts_se_handler
5074 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
5078 .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[]
5084 .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[]
5093 .byte 0x01,0x04,0x01,0x00
5094 .byte 0x04,0x02,0x00,0x00 # sub rsp,8
5099 local *opcode=shift; # alias @opcode to the array the caller passed by reference
5103 $rex|=0x04 if($dst>=8); # REX.R: extends the ModR/M reg field (xmm8-15)
5104 $rex|=0x01 if($src>=8); # REX.B: extends the ModR/M r/m field (xmm8-15)
5105 push @opcode,$rex|0x40 if($rex); # emit a REX prefix only when an extension bit is set
5112 if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { # form: immediate, xmm source, xmm destination
5113 rex(\@opcode,$4,$3); # optional REX prefix for xmm8-15
5114 push @opcode,0x0f,0x3a,0xdf; # opcode bytes 0F 3A DF
5115 push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M: register-direct, r/m=source, reg=destination
5117 push @opcode,$c=~/^0/?oct($c):$c; # trailing immediate byte; oct() decodes 0x.. and 0.. notations
5118 return ".byte\t".join(',',@opcode); # emit the instruction as raw bytes
5120 elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { # form: xmm source, xmm destination
5123 "aesenc" => 0xdc, "aesenclast" => 0xdd,
5124 "aesdec" => 0xde, "aesdeclast" => 0xdf
5126 return undef if (!defined($opcodelet{$1})); # mnemonic not in the table: nothing to encode
5127 rex(\@opcode,$3,$2); # optional REX prefix for xmm8-15
5128 push @opcode,0x0f,0x38,$opcodelet{$1}; # opcode bytes 0F 38 followed by the per-mnemonic byte
5129 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M: register-direct, r/m=source, reg=destination
5130 return ".byte\t".join(',',@opcode);
5132 elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) { # form: disp(%rsp) memory source, xmm destination
5134 "aesenc" => 0xdc, "aesenclast" => 0xdd,
5135 "aesdec" => 0xde, "aesdeclast" => 0xdf
5137 return undef if (!defined($opcodelet{$1})); # mnemonic not in the table: nothing to encode
5139 push @opcode,0x44 if ($3>=8); # REX prefix with the R bit set when the destination is xmm8-15
5140 push @opcode,0x0f,0x38,$opcodelet{$1}; # opcode bytes 0F 38 followed by the per-mnemonic byte
5141 push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M: disp8 mode with SIB; SIB 0x24 selects base %rsp, no index
5142 push @opcode,($off=~/^0/?oct($off):$off)&0xff; # displacement, truncated to one byte
5143 return ".byte\t".join(',',@opcode);
5149 ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift; # fixed encoding of movbe %eax,disp8(%rsp); the argument supplies the displacement byte
5152 $code =~ s/\`([^\`]*)\`/eval($1)/gem; # replace each backquoted Perl expression in the generated code with its value
5153 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; # pass aes* instructions with xmm operands through aesni() for byte encoding
5154 #$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
5155 $code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem; # byte-encode movbe stores of %eax to the stack via movbe()
5159 close STDOUT or die "error closing STDOUT"; # die if closing the output stream fails