2 # Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements support for Intel AES-NI extension. In
18 # OpenSSL context it's used with Intel engine, but can also be used as
19 # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
24 # Given aes(enc|dec) instructions' latency asymptotic performance for
25 # non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
26 # processed with 128-bit key. And given their throughput asymptotic
27 # performance for parallelizable modes is 1.25 cycles per byte. Being
28 # asymptotic limit it's not something you commonly achieve in reality,
29 # but how close does one get? Below are results collected for
30 # different modes and block sizes. Pairs of numbers are for en-/
33 # 16-byte 64-byte 256-byte 1-KB 8-KB
34 # ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
35 # CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
36 # CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
37 # CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
38 # OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
39 # CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
41 # ECB, CTR, CBC and CCM results are free from EVP overhead. This means
42 # that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
43 # [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
44 # The results were collected with specially crafted speed.c benchmark
45 # in order to compare them with results reported in "Intel Advanced
46 # Encryption Standard (AES) New Instruction Set" White Paper Revision
47 # 3.0 dated May 2010. All above results are consistently better. This
48 # module also provides better performance for block sizes smaller than
49 # 128 bytes in points *not* represented in the above table.
51 # Looking at the results for 8-KB buffer.
53 # CFB and OFB results are far from the limit, because implementation
54 # uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
55 # single-block aesni_encrypt, which is not the most optimal way to go.
56 # CBC encrypt result is unexpectedly high and there is no documented
57 # explanation for it. Seemingly there is a small penalty for feeding
58 # the result back to AES unit the way it's done in CBC mode. There is
59 # nothing one can do and the result appears optimal. CCM result is
60 # identical to CBC, because CBC-MAC is essentially CBC encrypt without
61 # saving output. CCM CTR "stays invisible," because it's neatly
62 # interleaved with CBC-MAC. This provides ~30% improvement over
63 # "straightforward" CCM implementation with CTR and CBC-MAC performed
64 # disjointly. Parallelizable modes practically achieve the theoretical
67 # Looking at how results vary with buffer size.
69 # Curves are practically saturated at 1-KB buffer size. In most cases
70 # "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
71 # CTR curve doesn't follow this pattern and is "slowest" changing one
72 # with "256-byte" result being 87% of "8-KB." This is because overhead
73 # in CTR mode is most computationally intensive. Small-block CCM
74 # decrypt is slower than encrypt, because first CTR and last CBC-MAC
75 # iterations can't be interleaved.
77 # Results for 192- and 256-bit keys.
79 # EVP-free results were observed to scale perfectly with number of
80 # rounds for larger block sizes, i.e. 192-bit result being 10/12 times
81 # lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
82 # are a tad smaller, because the above mentioned penalty biases all
83 # results by same constant value. In similar way function call
84 # overhead affects small-block performance, as well as OFB and CFB
85 # results. Differences are not large, most common coefficients are
86 # 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
87 # can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
91 # While Westmere processor features 6 cycles latency for aes[enc|dec]
92 # instructions, which can be scheduled every second cycle, Sandy
93 # Bridge spends 8 cycles per instruction, but it can schedule them
94 # every cycle. This means that code targeting Westmere would perform
95 # suboptimally on Sandy Bridge. Therefore this update.
97 # In addition, non-parallelizable CBC encrypt (as well as CCM) is
98 # optimized. Relative improvement might appear modest, 8% on Westmere,
99 # but in absolute terms it's 3.77 cycles per byte encrypted with
100 # 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
101 # should be compared to asymptotic limits of 3.75 for Westmere and
102 # 5.00 for Sandy Bridge. Actually, the fact that they get this close
103 # to asymptotic limits is quite amazing. Indeed, the limit is
104 # calculated as latency times number of rounds, 10 for 128-bit key,
105 # and divided by 16, the number of bytes in block, or in other words
106 # it accounts *solely* for aesenc instructions. But there are extra
107 # instructions, and numbers so close to the asymptotic limits mean
108 # that it's as if it takes as little as *one* additional cycle to
109 # execute all of them. How is it possible? It is possible thanks to
110 # out-of-order execution logic, which manages to overlap post-
111 # processing of previous block, things like saving the output, with
112 # actual encryption of current block, as well as pre-processing of
113 # current block, things like fetching input and xor-ing it with
114 # 0-round element of the key schedule, with actual encryption of
115 # previous block. Keep this in mind...
117 # For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
118 # performance is achieved by interleaving instructions working on
119 # independent blocks. In which case asymptotic limit for such modes
120 # can be obtained by dividing above mentioned numbers by AES
121 # instructions' interleave factor. Westmere can execute at most 3
122 # instructions at a time, meaning that optimal interleave factor is 3,
123 # and that's where the "magic" number of 1.25 comes from. "Optimal
124 # interleave factor" means that increase of interleave factor does
125 # not improve performance. The formula has proven to reflect reality
126 # pretty well on Westmere... Sandy Bridge on the other hand can
127 # execute up to 8 AES instructions at a time, so how does varying
128 # interleave factor affect the performance? Here is table for ECB
129 # (numbers are cycles per byte processed with 128-bit key):
131 # instruction interleave factor 3x 6x 8x
132 # theoretical asymptotic limit 1.67 0.83 0.625
133 # measured performance for 8KB block 1.05 0.86 0.84
135 # "as if" interleave factor 4.7x 5.8x 6.0x
137 # Further data for other parallelizable modes:
139 # CBC decrypt 1.16 0.93 0.74
142 # Well, given 3x column it's probably inappropriate to call the limit
143 # asymptotic, if it can be surpassed, isn't it? What happens there?
144 # Rewind to CBC paragraph for the answer. Yes, out-of-order execution
145 # magic is responsible for this. Processor overlaps not only the
146 # additional instructions with AES ones, but even AES instructions
147 # processing adjacent triplets of independent blocks. In the 6x case
148 # additional instructions still claim disproportionally small amount
149 # of additional cycles, but in 8x case number of instructions must be
150 # a tad too high for out-of-order logic to cope with, and AES unit
151 # remains underutilized... As you can see 8x interleave is hardly
152 # justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
153 # utilizes 6x interleave because of limited register bank capacity.
155 # Higher interleave factors do have negative impact on Westmere
156 # performance. While for ECB mode it's negligible ~1.5%, other
157 # parallelizables perform ~5% worse, which is outweighed by ~25%
158 # improvement on Sandy Bridge. To balance regression on Westmere
159 # CTR mode was implemented with 6x aesenc interleave factor.
163 # Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
164 # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
165 # in CTR mode AES instruction interleave factor was chosen to be 6x.
169 # Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
172 ######################################################################
173 # Current large-block performance in cycles per byte processed with
174 # 128-bit key (less is better).
176 # CBC en-/decrypt CTR XTS ECB OCB
177 # Westmere 3.77/1.25 1.25 1.25 1.26
178 # * Bridge 5.07/0.74 0.75 0.90 0.85 0.98
179 # Haswell 4.44/0.63 0.63 0.73 0.63 0.70
180 # Skylake 2.62/0.63 0.63 0.63 0.63
181 # Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11
182 # Knights L 2.54/0.77 0.78 0.85 - 1.50
183 # Goldmont 3.82/1.26 1.26 1.29 1.29 1.50
184 # Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95
185 # Ryzen 2.71/0.35 0.35 0.44 0.38 0.49
187 # (*) Atom Silvermont ECB result is suboptimal because of penalties
188 # incurred by operations on %xmm8-15. As ECB is not considered
189 # critical, nothing was done to mitigate the problem.
# Script setup: pick the symbol prefix, normalise $flavour/$output, locate
# the perlasm translator and pipe everything this script prints through it.
191 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
192 # generates drop-in replacement for
193 # crypto/aes/asm/aes-x86_64.pl:-)
# A "flavour" that contains a dot is really the output file name.
197 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 conventions are selected by an nasm/masm/mingw64 flavour or by an
# output name ending in .asm.
199 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Look for x86_64-xlate.pl next to this script first, then in
# ../../perlasm/ relative to it; give up if neither exists.
201 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
202 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
203 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
204 die "can't locate x86_64-xlate.pl";
# Run the translator with the same perl interpreter ($^X). The result of
# open is checked so that a failure to start the pipe is reported
# instead of being silently ignored.
206 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
# Operand names shared by the code generators in this file.
# NOTE: both arms of the conditional below are "movups", so $movkey is
# "movups" whatever $PREFIX is.
209 $movkey = $PREFIX eq "aesni" ? "movups" : "movups";
210 @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
211 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
214 $code.=".extern OPENSSL_ia32cap_P\n";
216 $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
217 # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
221 $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
222 $ivp="%r8"; # cbc, ctr, ...
224 $rnds_="%r10d"; # backup copy for $rounds
225 $key_="%r11"; # backup copy for $key
227 # %xmm register layout
228 $rndkey0="%xmm0"; $rndkey1="%xmm1";
229 $inout0="%xmm2"; $inout1="%xmm3";
230 $inout2="%xmm4"; $inout3="%xmm5";
231 $inout4="%xmm6"; $inout5="%xmm7";
232 $inout6="%xmm8"; $inout7="%xmm9";
# $in2/$in1/$in0/$iv name the same registers as $inout4..$inout7
# (%xmm6..%xmm9), so the two sets cannot hold live values at the same time.
234 $in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
235 $in0="%xmm8"; $iv="%xmm9";
237 # Inline version of internal aesni_[en|de]crypt1.
239 # Why folded loop? Because aes[enc|dec] is slow enough to accommodate
240 # cycles which take care of loop variables...
# aesni_generate1($p,$key,$rounds[,$inout[,$ivec]]) appends a single-block
# aes${p} sequence to $code; $p is spliced into the mnemonics, so callers
# pass "enc" or "dec". $inout defaults to $inout0 when omitted.
# The emitted code loads the first two round keys from ($key) and
# 16($key). Without $ivec the block is whitened with a plain xorps of
# $rndkey0; when $ivec is defined a separate prologue is emitted instead.
# One aes${p} is then issued per pass of .Loop_${p}1_$sn, and
# aes${p}last completes the block.
# NOTE(review): $sn is used in the loop label but is not set in the lines
# visible here; presumably it is a per-call counter that keeps labels unique.
242 sub aesni_generate1 {
243 my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
246 $movkey ($key),$rndkey0
247 $movkey 16($key),$rndkey1
249 $code.=<<___ if (defined($ivec));
254 $code.=<<___ if (!defined($ivec));
256 xorps $rndkey0,$inout
260 aes${p} $rndkey1,$inout
262 $movkey ($key),$rndkey1
264 jnz .Loop_${p}1_$sn # loop body is 16 bytes
265 aes${p}last $rndkey1,$inout
268 # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
# Emits the two public single-block entry points. Their three arguments
# are taken from the first three registers of @_4args (Win64 or Unix
# order). Each routine loads one block from $inp, reads the round count
# from offset 240 of the key structure, runs the inline single-block
# sequence from aesni_generate1, wipes the two round-key registers and
# then stores the result to $out.
270 { my ($inp,$out,$key) = @_4args;
273 .globl ${PREFIX}_encrypt
274 .type ${PREFIX}_encrypt,\@abi-omnipotent
278 movups ($inp),$inout0 # load input
279 mov 240($key),$rounds # key->rounds
281 &aesni_generate1("enc",$key,$rounds);
283 pxor $rndkey0,$rndkey0 # clear register bank
284 pxor $rndkey1,$rndkey1
285 movups $inout0,($out) # output
289 .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
291 .globl ${PREFIX}_decrypt
292 .type ${PREFIX}_decrypt,\@abi-omnipotent
296 movups ($inp),$inout0 # load input
297 mov 240($key),$rounds # key->rounds
299 &aesni_generate1("dec",$key,$rounds);
301 pxor $rndkey0,$rndkey0 # clear register bank
302 pxor $rndkey1,$rndkey1
303 movups $inout0,($out) # output
307 .size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
311 # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
312 # factor. Why were 3x subroutines originally used in loops? Even though
313 # aes[enc|dec] latency was originally 6, it could be scheduled only
314 # every *2nd* cycle. Thus 3x interleave was the one providing optimal
315 # utilization, i.e. when subroutine's throughput is virtually same as
316 # of non-interleaved subroutine [for number of input blocks up to 3].
317 # This is why it originally made no sense to implement 2x subroutine.
318 # But times change and it became appropriate to spend extra 192 bytes
319 # on 2x subroutine on Atom Silvermont account. For processors that
320 # can schedule aes[enc|dec] every cycle, the optimal interleave factor
321 # equals the corresponding instruction's latency. 8x is optimal for
322 # * Bridge and "super-optimal" for other Intel CPUs...
# Emits the private routine _aesni_${dir}rypt2, which processes $inout0
# and $inout1 in lock-step. ${dir} is spliced into the mnemonics and the
# symbol name; presumably it is this sub's argument ("enc" or "dec", as
# passed by the generator calls in this file) - its declaration is not
# among the lines visible here.
# Emitted sequence: load round keys 0 and 1, xor key 0 into both blocks,
# point $key past the schedule with lea 32($key,$rounds), then alternate
# $rndkey1/$rndkey0 rounds while fetching the next keys relative to $key
# via %rax, and finish with aes${dir}last.
324 sub aesni_generate2 {
326 # As already mentioned it takes in $key and $rounds, which are *not*
327 # preserved. $inout[0-1] is cipher/clear text...
329 .type _aesni_${dir}rypt2,\@abi-omnipotent
333 $movkey ($key),$rndkey0
335 $movkey 16($key),$rndkey1
336 xorps $rndkey0,$inout0
337 xorps $rndkey0,$inout1
338 $movkey 32($key),$rndkey0
339 lea 32($key,$rounds),$key
344 aes${dir} $rndkey1,$inout0
345 aes${dir} $rndkey1,$inout1
346 $movkey ($key,%rax),$rndkey1
348 aes${dir} $rndkey0,$inout0
349 aes${dir} $rndkey0,$inout1
350 $movkey -16($key,%rax),$rndkey0
353 aes${dir} $rndkey1,$inout0
354 aes${dir} $rndkey1,$inout1
355 aes${dir}last $rndkey0,$inout0
356 aes${dir}last $rndkey0,$inout1
359 .size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
# Emits the private routine _aesni_${dir}rypt3, the three-block
# counterpart of the 2x routine: $inout0..$inout2 are whitened with round
# key 0 and then pushed through alternating $rndkey1/$rndkey0 rounds, with
# the next keys fetched relative to the advanced $key via %rax, before
# aes${dir}last finishes all three blocks.
# ${dir} is presumably this sub's "enc"/"dec" argument; its declaration
# is not among the lines visible here.
362 sub aesni_generate3 {
364 # As already mentioned it takes in $key and $rounds, which are *not*
365 # preserved. $inout[0-2] is cipher/clear text...
367 .type _aesni_${dir}rypt3,\@abi-omnipotent
371 $movkey ($key),$rndkey0
373 $movkey 16($key),$rndkey1
374 xorps $rndkey0,$inout0
375 xorps $rndkey0,$inout1
376 xorps $rndkey0,$inout2
377 $movkey 32($key),$rndkey0
378 lea 32($key,$rounds),$key
383 aes${dir} $rndkey1,$inout0
384 aes${dir} $rndkey1,$inout1
385 aes${dir} $rndkey1,$inout2
386 $movkey ($key,%rax),$rndkey1
388 aes${dir} $rndkey0,$inout0
389 aes${dir} $rndkey0,$inout1
390 aes${dir} $rndkey0,$inout2
391 $movkey -16($key,%rax),$rndkey0
394 aes${dir} $rndkey1,$inout0
395 aes${dir} $rndkey1,$inout1
396 aes${dir} $rndkey1,$inout2
397 aes${dir}last $rndkey0,$inout0
398 aes${dir}last $rndkey0,$inout1
399 aes${dir}last $rndkey0,$inout2
402 .size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
405 # 4x interleave is implemented to improve small block performance,
406 # most notably [and naturally] 4 block by ~30%. One can argue that one
407 # should have implemented 5x as well, but improvement would be <20%,
408 # so it's not worth it...
# Emits the private routine _aesni_${dir}rypt4. It has the same shape as
# the 2x/3x routines but works on $inout0..$inout3: whitening with round
# key 0, alternating $rndkey1/$rndkey0 rounds with keys fetched relative
# to the advanced $key via %rax, then aes${dir}last on all four blocks.
# ${dir} is presumably this sub's "enc"/"dec" argument; its declaration
# is not among the lines visible here.
409 sub aesni_generate4 {
411 # As already mentioned it takes in $key and $rounds, which are *not*
412 # preserved. $inout[0-3] is cipher/clear text...
414 .type _aesni_${dir}rypt4,\@abi-omnipotent
418 $movkey ($key),$rndkey0
420 $movkey 16($key),$rndkey1
421 xorps $rndkey0,$inout0
422 xorps $rndkey0,$inout1
423 xorps $rndkey0,$inout2
424 xorps $rndkey0,$inout3
425 $movkey 32($key),$rndkey0
426 lea 32($key,$rounds),$key
432 aes${dir} $rndkey1,$inout0
433 aes${dir} $rndkey1,$inout1
434 aes${dir} $rndkey1,$inout2
435 aes${dir} $rndkey1,$inout3
436 $movkey ($key,%rax),$rndkey1
438 aes${dir} $rndkey0,$inout0
439 aes${dir} $rndkey0,$inout1
440 aes${dir} $rndkey0,$inout2
441 aes${dir} $rndkey0,$inout3
442 $movkey -16($key,%rax),$rndkey0
445 aes${dir} $rndkey1,$inout0
446 aes${dir} $rndkey1,$inout1
447 aes${dir} $rndkey1,$inout2
448 aes${dir} $rndkey1,$inout3
449 aes${dir}last $rndkey0,$inout0
450 aes${dir}last $rndkey0,$inout1
451 aes${dir}last $rndkey0,$inout2
452 aes${dir}last $rndkey0,$inout3
455 .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
# Emits the private routine _aesni_${dir}rypt6 for $inout0..$inout5.
# Unlike the smaller variants, the prologue is interleaved: the first
# $rndkey1 round is already applied to blocks 0-2 while blocks 3-5 are
# still being whitened, and control then jumps to .L${dir}_loop6_enter in
# the middle of the loop body so that blocks 3-5 receive that same round.
# From there on rounds alternate between $rndkey1 and $rndkey0 as in the
# other variants and aes${dir}last finishes all six blocks.
# ${dir} is presumably this sub's "enc"/"dec" argument; its declaration
# is not among the lines visible here.
458 sub aesni_generate6 {
460 # As already mentioned it takes in $key and $rounds, which are *not*
461 # preserved. $inout[0-5] is cipher/clear text...
463 .type _aesni_${dir}rypt6,\@abi-omnipotent
467 $movkey ($key),$rndkey0
469 $movkey 16($key),$rndkey1
470 xorps $rndkey0,$inout0
471 pxor $rndkey0,$inout1
472 pxor $rndkey0,$inout2
473 aes${dir} $rndkey1,$inout0
474 lea 32($key,$rounds),$key
476 aes${dir} $rndkey1,$inout1
477 pxor $rndkey0,$inout3
478 pxor $rndkey0,$inout4
479 aes${dir} $rndkey1,$inout2
480 pxor $rndkey0,$inout5
481 $movkey ($key,%rax),$rndkey0
483 jmp .L${dir}_loop6_enter
486 aes${dir} $rndkey1,$inout0
487 aes${dir} $rndkey1,$inout1
488 aes${dir} $rndkey1,$inout2
489 .L${dir}_loop6_enter:
490 aes${dir} $rndkey1,$inout3
491 aes${dir} $rndkey1,$inout4
492 aes${dir} $rndkey1,$inout5
493 $movkey ($key,%rax),$rndkey1
495 aes${dir} $rndkey0,$inout0
496 aes${dir} $rndkey0,$inout1
497 aes${dir} $rndkey0,$inout2
498 aes${dir} $rndkey0,$inout3
499 aes${dir} $rndkey0,$inout4
500 aes${dir} $rndkey0,$inout5
501 $movkey -16($key,%rax),$rndkey0
504 aes${dir} $rndkey1,$inout0
505 aes${dir} $rndkey1,$inout1
506 aes${dir} $rndkey1,$inout2
507 aes${dir} $rndkey1,$inout3
508 aes${dir} $rndkey1,$inout4
509 aes${dir} $rndkey1,$inout5
510 aes${dir}last $rndkey0,$inout0
511 aes${dir}last $rndkey0,$inout1
512 aes${dir}last $rndkey0,$inout2
513 aes${dir}last $rndkey0,$inout3
514 aes${dir}last $rndkey0,$inout4
515 aes${dir}last $rndkey0,$inout5
518 .size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
# Emits the private routine _aesni_${dir}rypt8 for $inout0..$inout7.
# The prologue is interleaved like the 6x one: the first $rndkey1 round
# is applied to blocks 0-1 while blocks 2-7 are still being whitened, and
# control jumps to .L${dir}_loop8_inner so that blocks 2-7 receive that
# round. Rounds then alternate between $rndkey1 and $rndkey0 and
# aes${dir}last finishes all eight blocks.
# NOTE(review): nothing in the visible lines jumps to
# .L${dir}_loop8_enter; presumably it is an entry point for code
# elsewhere in the file - confirm.
# ${dir} is presumably this sub's "enc"/"dec" argument; its declaration
# is not among the lines visible here.
521 sub aesni_generate8 {
523 # As already mentioned it takes in $key and $rounds, which are *not*
524 # preserved. $inout[0-7] is cipher/clear text...
526 .type _aesni_${dir}rypt8,\@abi-omnipotent
530 $movkey ($key),$rndkey0
532 $movkey 16($key),$rndkey1
533 xorps $rndkey0,$inout0
534 xorps $rndkey0,$inout1
535 pxor $rndkey0,$inout2
536 pxor $rndkey0,$inout3
537 pxor $rndkey0,$inout4
538 lea 32($key,$rounds),$key
540 aes${dir} $rndkey1,$inout0
541 pxor $rndkey0,$inout5
542 pxor $rndkey0,$inout6
543 aes${dir} $rndkey1,$inout1
544 pxor $rndkey0,$inout7
545 $movkey ($key,%rax),$rndkey0
547 jmp .L${dir}_loop8_inner
550 aes${dir} $rndkey1,$inout0
551 aes${dir} $rndkey1,$inout1
552 .L${dir}_loop8_inner:
553 aes${dir} $rndkey1,$inout2
554 aes${dir} $rndkey1,$inout3
555 aes${dir} $rndkey1,$inout4
556 aes${dir} $rndkey1,$inout5
557 aes${dir} $rndkey1,$inout6
558 aes${dir} $rndkey1,$inout7
559 .L${dir}_loop8_enter:
560 $movkey ($key,%rax),$rndkey1
562 aes${dir} $rndkey0,$inout0
563 aes${dir} $rndkey0,$inout1
564 aes${dir} $rndkey0,$inout2
565 aes${dir} $rndkey0,$inout3
566 aes${dir} $rndkey0,$inout4
567 aes${dir} $rndkey0,$inout5
568 aes${dir} $rndkey0,$inout6
569 aes${dir} $rndkey0,$inout7
570 $movkey -16($key,%rax),$rndkey0
573 aes${dir} $rndkey1,$inout0
574 aes${dir} $rndkey1,$inout1
575 aes${dir} $rndkey1,$inout2
576 aes${dir} $rndkey1,$inout3
577 aes${dir} $rndkey1,$inout4
578 aes${dir} $rndkey1,$inout5
579 aes${dir} $rndkey1,$inout6
580 aes${dir} $rndkey1,$inout7
581 aes${dir}last $rndkey0,$inout0
582 aes${dir}last $rndkey0,$inout1
583 aes${dir}last $rndkey0,$inout2
584 aes${dir}last $rndkey0,$inout3
585 aes${dir}last $rndkey0,$inout4
586 aes${dir}last $rndkey0,$inout5
587 aes${dir}last $rndkey0,$inout6
588 aes${dir}last $rndkey0,$inout7
591 .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
# Instantiate the interleaved helpers for factors 2, 3, 4, 6 and 8.
# The "dec" variants are always generated; the "enc" variants only when
# $PREFIX is "aesni".
594 &aesni_generate2("enc") if ($PREFIX eq "aesni");
595 &aesni_generate2("dec");
596 &aesni_generate3("enc") if ($PREFIX eq "aesni");
597 &aesni_generate3("dec");
598 &aesni_generate4("enc") if ($PREFIX eq "aesni");
599 &aesni_generate4("dec");
600 &aesni_generate6("enc") if ($PREFIX eq "aesni");
601 &aesni_generate6("dec");
602 &aesni_generate8("enc") if ($PREFIX eq "aesni");
603 &aesni_generate8("dec");
605 if ($PREFIX eq "aesni") {
606 ########################################################################
607 # void aesni_ecb_encrypt (const void *in, void *out,
608 # size_t length, const AES_KEY *key,
611 .globl aesni_ecb_encrypt
612 .type aesni_ecb_encrypt,\@function,5
617 $code.=<<___ if ($win64);
619 movaps %xmm6,(%rsp) # offload $inout4..7
620 movaps %xmm7,0x10(%rsp)
621 movaps %xmm8,0x20(%rsp)
622 movaps %xmm9,0x30(%rsp)
626 and \$-16,$len # if ($len<16)
627 jz .Lecb_ret # return
629 mov 240($key),$rounds # key->rounds
630 $movkey ($key),$rndkey0
631 mov $key,$key_ # backup $key
632 mov $rounds,$rnds_ # backup $rounds
633 test %r8d,%r8d # 5th argument
635 #--------------------------- ECB ENCRYPT ------------------------------#
636 cmp \$0x80,$len # if ($len<8*16)
637 jb .Lecb_enc_tail # short input
639 movdqu ($inp),$inout0 # load 8 input blocks
640 movdqu 0x10($inp),$inout1
641 movdqu 0x20($inp),$inout2
642 movdqu 0x30($inp),$inout3
643 movdqu 0x40($inp),$inout4
644 movdqu 0x50($inp),$inout5
645 movdqu 0x60($inp),$inout6
646 movdqu 0x70($inp),$inout7
647 lea 0x80($inp),$inp # $inp+=8*16
648 sub \$0x80,$len # $len-=8*16 (can be zero)
649 jmp .Lecb_enc_loop8_enter
652 movups $inout0,($out) # store 8 output blocks
653 mov $key_,$key # restore $key
654 movdqu ($inp),$inout0 # load 8 input blocks
655 mov $rnds_,$rounds # restore $rounds
656 movups $inout1,0x10($out)
657 movdqu 0x10($inp),$inout1
658 movups $inout2,0x20($out)
659 movdqu 0x20($inp),$inout2
660 movups $inout3,0x30($out)
661 movdqu 0x30($inp),$inout3
662 movups $inout4,0x40($out)
663 movdqu 0x40($inp),$inout4
664 movups $inout5,0x50($out)
665 movdqu 0x50($inp),$inout5
666 movups $inout6,0x60($out)
667 movdqu 0x60($inp),$inout6
668 movups $inout7,0x70($out)
669 lea 0x80($out),$out # $out+=8*16
670 movdqu 0x70($inp),$inout7
671 lea 0x80($inp),$inp # $inp+=8*16
672 .Lecb_enc_loop8_enter:
677 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow
679 movups $inout0,($out) # store 8 output blocks
680 mov $key_,$key # restore $key
681 movups $inout1,0x10($out)
682 mov $rnds_,$rounds # restore $rounds
683 movups $inout2,0x20($out)
684 movups $inout3,0x30($out)
685 movups $inout4,0x40($out)
686 movups $inout5,0x50($out)
687 movups $inout6,0x60($out)
688 movups $inout7,0x70($out)
689 lea 0x80($out),$out # $out+=8*16
690 add \$0x80,$len # restore real remaining $len
691 jz .Lecb_ret # done if ($len==0)
693 .Lecb_enc_tail: # $len is less than 8*16
694 movups ($inp),$inout0
697 movups 0x10($inp),$inout1
699 movups 0x20($inp),$inout2
702 movups 0x30($inp),$inout3
704 movups 0x40($inp),$inout4
707 movups 0x50($inp),$inout5
709 movdqu 0x60($inp),$inout6
710 xorps $inout7,$inout7
712 movups $inout0,($out) # store 7 output blocks
713 movups $inout1,0x10($out)
714 movups $inout2,0x20($out)
715 movups $inout3,0x30($out)
716 movups $inout4,0x40($out)
717 movups $inout5,0x50($out)
718 movups $inout6,0x60($out)
723 &aesni_generate1("enc",$key,$rounds);
725 movups $inout0,($out) # store one output block
730 movups $inout0,($out) # store 2 output blocks
731 movups $inout1,0x10($out)
736 movups $inout0,($out) # store 3 output blocks
737 movups $inout1,0x10($out)
738 movups $inout2,0x20($out)
743 movups $inout0,($out) # store 4 output blocks
744 movups $inout1,0x10($out)
745 movups $inout2,0x20($out)
746 movups $inout3,0x30($out)
750 xorps $inout5,$inout5
752 movups $inout0,($out) # store 5 output blocks
753 movups $inout1,0x10($out)
754 movups $inout2,0x20($out)
755 movups $inout3,0x30($out)
756 movups $inout4,0x40($out)
761 movups $inout0,($out) # store 6 output blocks
762 movups $inout1,0x10($out)
763 movups $inout2,0x20($out)
764 movups $inout3,0x30($out)
765 movups $inout4,0x40($out)
766 movups $inout5,0x50($out)
768 \f#--------------------------- ECB DECRYPT ------------------------------#
771 cmp \$0x80,$len # if ($len<8*16)
772 jb .Lecb_dec_tail # short input
774 movdqu ($inp),$inout0 # load 8 input blocks
775 movdqu 0x10($inp),$inout1
776 movdqu 0x20($inp),$inout2
777 movdqu 0x30($inp),$inout3
778 movdqu 0x40($inp),$inout4
779 movdqu 0x50($inp),$inout5
780 movdqu 0x60($inp),$inout6
781 movdqu 0x70($inp),$inout7
782 lea 0x80($inp),$inp # $inp+=8*16
783 sub \$0x80,$len # $len-=8*16 (can be zero)
784 jmp .Lecb_dec_loop8_enter
787 movups $inout0,($out) # store 8 output blocks
788 mov $key_,$key # restore $key
789 movdqu ($inp),$inout0 # load 8 input blocks
790 mov $rnds_,$rounds # restore $rounds
791 movups $inout1,0x10($out)
792 movdqu 0x10($inp),$inout1
793 movups $inout2,0x20($out)
794 movdqu 0x20($inp),$inout2
795 movups $inout3,0x30($out)
796 movdqu 0x30($inp),$inout3
797 movups $inout4,0x40($out)
798 movdqu 0x40($inp),$inout4
799 movups $inout5,0x50($out)
800 movdqu 0x50($inp),$inout5
801 movups $inout6,0x60($out)
802 movdqu 0x60($inp),$inout6
803 movups $inout7,0x70($out)
804 lea 0x80($out),$out # $out+=8*16
805 movdqu 0x70($inp),$inout7
806 lea 0x80($inp),$inp # $inp+=8*16
807 .Lecb_dec_loop8_enter:
811 $movkey ($key_),$rndkey0
813 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow
815 movups $inout0,($out) # store 8 output blocks
816 pxor $inout0,$inout0 # clear register bank
817 mov $key_,$key # restore $key
818 movups $inout1,0x10($out)
820 mov $rnds_,$rounds # restore $rounds
821 movups $inout2,0x20($out)
823 movups $inout3,0x30($out)
825 movups $inout4,0x40($out)
827 movups $inout5,0x50($out)
829 movups $inout6,0x60($out)
831 movups $inout7,0x70($out)
833 lea 0x80($out),$out # $out+=8*16
834 add \$0x80,$len # restore real remaining $len
835 jz .Lecb_ret # done if ($len==0)
838 movups ($inp),$inout0
841 movups 0x10($inp),$inout1
843 movups 0x20($inp),$inout2
846 movups 0x30($inp),$inout3
848 movups 0x40($inp),$inout4
851 movups 0x50($inp),$inout5
853 movups 0x60($inp),$inout6
854 $movkey ($key),$rndkey0
855 xorps $inout7,$inout7
857 movups $inout0,($out) # store 7 output blocks
858 pxor $inout0,$inout0 # clear register bank
859 movups $inout1,0x10($out)
861 movups $inout2,0x20($out)
863 movups $inout3,0x30($out)
865 movups $inout4,0x40($out)
867 movups $inout5,0x50($out)
869 movups $inout6,0x60($out)
876 &aesni_generate1("dec",$key,$rounds);
878 movups $inout0,($out) # store one output block
879 pxor $inout0,$inout0 # clear register bank
884 movups $inout0,($out) # store 2 output blocks
885 pxor $inout0,$inout0 # clear register bank
886 movups $inout1,0x10($out)
892 movups $inout0,($out) # store 3 output blocks
893 pxor $inout0,$inout0 # clear register bank
894 movups $inout1,0x10($out)
896 movups $inout2,0x20($out)
902 movups $inout0,($out) # store 4 output blocks
903 pxor $inout0,$inout0 # clear register bank
904 movups $inout1,0x10($out)
906 movups $inout2,0x20($out)
908 movups $inout3,0x30($out)
913 xorps $inout5,$inout5
915 movups $inout0,($out) # store 5 output blocks
916 pxor $inout0,$inout0 # clear register bank
917 movups $inout1,0x10($out)
919 movups $inout2,0x20($out)
921 movups $inout3,0x30($out)
923 movups $inout4,0x40($out)
930 movups $inout0,($out) # store 6 output blocks
931 pxor $inout0,$inout0 # clear register bank
932 movups $inout1,0x10($out)
934 movups $inout2,0x20($out)
936 movups $inout3,0x30($out)
938 movups $inout4,0x40($out)
940 movups $inout5,0x50($out)
944 xorps $rndkey0,$rndkey0 # %xmm0
945 pxor $rndkey1,$rndkey1
947 $code.=<<___ if ($win64);
949 movaps %xmm0,(%rsp) # clear stack
950 movaps 0x10(%rsp),%xmm7
951 movaps %xmm0,0x10(%rsp)
952 movaps 0x20(%rsp),%xmm8
953 movaps %xmm0,0x20(%rsp)
954 movaps 0x30(%rsp),%xmm9
955 movaps %xmm0,0x30(%rsp)
962 .size aesni_ecb_encrypt,.-aesni_ecb_encrypt
966 ######################################################################
967 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
968 # size_t blocks, const AES_KEY *key,
969 # const char *ivec,char *cmac);
971 # Handles only complete blocks, operates on 64-bit counter and
972 # does not update *ivec! Nor does it finalize CMAC value
973 # (see engine/eng_aesni.c for details)
976 my $cmac="%r9"; # 6th argument
978 my $increment="%xmm9";
980 my $bswap_mask="%xmm7";
983 .globl aesni_ccm64_encrypt_blocks
984 .type aesni_ccm64_encrypt_blocks,\@function,6
986 aesni_ccm64_encrypt_blocks:
988 $code.=<<___ if ($win64);
990 movaps %xmm6,(%rsp) # $iv
991 movaps %xmm7,0x10(%rsp) # $bswap_mask
992 movaps %xmm8,0x20(%rsp) # $in0
993 movaps %xmm9,0x30(%rsp) # $increment
997 mov 240($key),$rounds # key->rounds
999 movdqa .Lincrement64(%rip),$increment
1000 movdqa .Lbswap_mask(%rip),$bswap_mask
1005 movdqu ($cmac),$inout1
1007 lea 32($key,$rounds),$key # end of key schedule
1008 pshufb $bswap_mask,$iv
1009 sub %rax,%r10 # twisted $rounds
1010 jmp .Lccm64_enc_outer
1013 $movkey ($key_),$rndkey0
1015 movups ($inp),$in0 # load inp
1017 xorps $rndkey0,$inout0 # counter
1018 $movkey 16($key_),$rndkey1
1020 xorps $rndkey0,$inout1 # cmac^=inp
1021 $movkey 32($key_),$rndkey0
1024 aesenc $rndkey1,$inout0
1025 aesenc $rndkey1,$inout1
1026 $movkey ($key,%rax),$rndkey1
1028 aesenc $rndkey0,$inout0
1029 aesenc $rndkey0,$inout1
1030 $movkey -16($key,%rax),$rndkey0
1031 jnz .Lccm64_enc2_loop
1032 aesenc $rndkey1,$inout0
1033 aesenc $rndkey1,$inout1
1034 paddq $increment,$iv
1035 dec $len # $len-- ($len is in blocks)
1036 aesenclast $rndkey0,$inout0
1037 aesenclast $rndkey0,$inout1
1040 xorps $inout0,$in0 # inp ^= E(iv)
1042 movups $in0,($out) # save output
1043 pshufb $bswap_mask,$inout0
1044 lea 16($out),$out # $out+=16
1045 jnz .Lccm64_enc_outer # loop if ($len!=0)
1047 pxor $rndkey0,$rndkey0 # clear register bank
1048 pxor $rndkey1,$rndkey1
1049 pxor $inout0,$inout0
1050 movups $inout1,($cmac) # store resulting mac
1051 pxor $inout1,$inout1
1055 $code.=<<___ if ($win64);
1057 movaps %xmm0,(%rsp) # clear stack
1058 movaps 0x10(%rsp),%xmm7
1059 movaps %xmm0,0x10(%rsp)
1060 movaps 0x20(%rsp),%xmm8
1061 movaps %xmm0,0x20(%rsp)
1062 movaps 0x30(%rsp),%xmm9
1063 movaps %xmm0,0x30(%rsp)
1069 .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
1071 ######################################################################
# aesni_ccm64_decrypt_blocks is declared below as a six-argument function.
# Per the inline notes, each pass XORs an input block with E(iv), stores the
# result, and folds that output into the running CMAC. The final CMAC is
# written back through the cmac pointer and the working registers are zeroed
# on the way out.
1073 .globl aesni_ccm64_decrypt_blocks
1074 .type aesni_ccm64_decrypt_blocks,\@function,6
1076 aesni_ccm64_decrypt_blocks:
# Win64 only: reserve 0x58 bytes of stack and park xmm6-xmm9 there.
1078 $code.=<<___ if ($win64);
1079 lea -0x58(%rsp),%rsp
1080 movaps %xmm6,(%rsp) # $iv
1081 movaps %xmm7,0x10(%rsp) # $bswap_mask
1082 movaps %xmm8,0x20(%rsp) # $in8
1083 movaps %xmm9,0x30(%rsp) # $increment
1087 mov 240($key),$rounds # key->rounds
1089 movdqu ($cmac),$inout1 # load the incoming CMAC state
1090 movdqa .Lincrement64(%rip),$increment # constant added to the counter by paddq below
1091 movdqa .Lbswap_mask(%rip),$bswap_mask # shuffle mask for the pshufb byte swaps
1096 pshufb $bswap_mask,$iv
1098 &aesni_generate1("enc",$key,$rounds); # helper defined outside this chunk (called in "enc" mode)
1102 movups ($inp),$in0 # load inp
1103 paddq $increment,$iv
1104 lea 16($inp),$inp # $inp+=16
1105 sub %r10,%rax # twisted $rounds
1106 lea 32($key_,$rnds_),$key # end of key schedule
1108 jmp .Lccm64_dec_outer
1111 xorps $inout0,$in0 # inp ^= E(iv)
1113 movups $in0,($out) # save output
1114 lea 16($out),$out # $out+=16
1115 pshufb $bswap_mask,$inout0
1117 sub \$1,$len # $len-- ($len is in blocks)
1118 jz .Lccm64_dec_break # if ($len==0) break
1120 $movkey ($key_),$rndkey0
1122 $movkey 16($key_),$rndkey1
1124 xorps $rndkey0,$inout0
1125 xorps $in0,$inout1 # cmac^=out
1126 $movkey 32($key_),$rndkey0
1127 jmp .Lccm64_dec2_loop
# Two lanes are advanced together here: inout0 and inout1 receive the same
# round key in every aesenc pair.
1130 aesenc $rndkey1,$inout0
1131 aesenc $rndkey1,$inout1
1132 $movkey ($key,%rax),$rndkey1
1134 aesenc $rndkey0,$inout0
1135 aesenc $rndkey0,$inout1
1136 $movkey -16($key,%rax),$rndkey0
1137 jnz .Lccm64_dec2_loop
1138 movups ($inp),$in0 # load input
1139 paddq $increment,$iv
1140 aesenc $rndkey1,$inout0
1141 aesenc $rndkey1,$inout1
1142 aesenclast $rndkey0,$inout0
1143 aesenclast $rndkey0,$inout1
1144 lea 16($inp),$inp # $inp+=16
1145 jmp .Lccm64_dec_outer
1149 #xorps $in0,$inout1 # cmac^=out
1150 mov 240($key_),$rounds
1152 &aesni_generate1("enc",$key_,$rounds,$inout1,$in0); # helper defined outside this chunk; here it is handed the CMAC register and in0
1154 pxor $rndkey0,$rndkey0 # clear register bank
1155 pxor $rndkey1,$rndkey1
1156 pxor $inout0,$inout0
1157 movups $inout1,($cmac) # store resulting mac
1158 pxor $inout1,$inout1
# Win64 only: restore the parked XMM registers and overwrite their stack slots.
1162 $code.=<<___ if ($win64);
1164 movaps %xmm0,(%rsp) # clear stack
1165 movaps 0x10(%rsp),%xmm7
1166 movaps %xmm0,0x10(%rsp)
1167 movaps 0x20(%rsp),%xmm8
1168 movaps %xmm0,0x20(%rsp)
1169 movaps 0x30(%rsp),%xmm9
1170 movaps %xmm0,0x30(%rsp)
1176 .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
1179 ######################################################################
1180 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1181 # size_t blocks, const AES_KEY *key,
1182 # const char *ivec);
1184 # Handles only complete blocks, operates on 32-bit counter and
1185 # does not update *ivec! (see crypto/modes/ctr128.c for details)
1187 # Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
1188 # http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
1189 # Keywords are full unroll and modulo-schedule counter calculations
1190 # with zero-round key xor.
# Register aliases and stack frame for aesni_ctr32_encrypt_blocks:
#   in0..in5 map to xmm10..xmm15, key0 is ebp, and ctr is the ivp register
#   name with a "d" suffix (presumably its 32-bit form; confirm against the
#   definition of ivp). The frame is 0x80 bytes for the eight 16-byte counter
#   slots, plus 160 bytes on Win64 for the ten XMM registers saved below.
1192 my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
1193 my ($key0,$ctr)=("%ebp","${ivp}d");
1194 my $frame_size = 0x80 + ($win64?160:0);
1197 .globl aesni_ctr32_encrypt_blocks
1198 .type aesni_ctr32_encrypt_blocks,\@function,5
1200 aesni_ctr32_encrypt_blocks:
1205 # handle single block without allocating stack frame,
1206 # useful when handling edges
1207 movups ($ivp),$inout0 # counter block
1208 movups ($inp),$inout1 # the single input block
1209 mov 240($key),%edx # key->rounds
1211 &aesni_generate1("enc",$key,"%edx"); # helper defined outside this chunk (called in "enc" mode)
1213 pxor $rndkey0,$rndkey0 # clear register bank
1214 pxor $rndkey1,$rndkey1
1215 xorps $inout1,$inout0 # combine the input block with the processed counter block
1216 pxor $inout1,$inout1
1217 movups $inout0,($out)
1218 xorps $inout0,$inout0
1219 jmp .Lctr32_epilogue
1223 lea (%rsp),$key_ # use $key_ as frame pointer
1224 .cfi_def_cfa_register $key_
1227 sub \$$frame_size,%rsp # carve out the frame below the saved stack pointer
1228 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
1230 $code.=<<___ if ($win64);
1231 movaps %xmm6,-0xa8($key_) # offload everything
1232 movaps %xmm7,-0x98($key_)
1233 movaps %xmm8,-0x88($key_)
1234 movaps %xmm9,-0x78($key_)
1235 movaps %xmm10,-0x68($key_)
1236 movaps %xmm11,-0x58($key_)
1237 movaps %xmm12,-0x48($key_)
1238 movaps %xmm13,-0x38($key_)
1239 movaps %xmm14,-0x28($key_)
1240 movaps %xmm15,-0x18($key_)
1245 # 8 16-byte words on top of stack are counter values
1246 # xor-ed with zero-round key
1248 movdqu ($ivp),$inout0
1249 movdqu ($key),$rndkey0
1250 mov 12($ivp),$ctr # counter LSB
1251 pxor $rndkey0,$inout0
1252 mov 12($key),$key0 # 0-round key LSB
1253 movdqa $inout0,0x00(%rsp) # populate counter block
1255 movdqa $inout0,$inout1
1256 movdqa $inout0,$inout2
1257 movdqa $inout0,$inout3
1258 movdqa $inout0,0x40(%rsp)
1259 movdqa $inout0,0x50(%rsp)
1260 movdqa $inout0,0x60(%rsp)
1261 mov %rdx,%r10 # about to borrow %rdx
1262 movdqa $inout0,0x70(%rsp)
1270 pinsrd \$3,%eax,$inout1
1272 movdqa $inout1,0x10(%rsp)
1273 pinsrd \$3,%edx,$inout2
1275 mov %r10,%rdx # restore %rdx
1277 movdqa $inout2,0x20(%rsp)
1280 pinsrd \$3,%eax,$inout3
1282 movdqa $inout3,0x30(%rsp)
1284 mov %r10d,0x40+12(%rsp)
1287 mov 240($key),$rounds # key->rounds
1290 mov %r9d,0x50+12(%rsp)
1293 mov %r10d,0x60+12(%rsp)
1295 mov OPENSSL_ia32cap_P+4(%rip),%r10d # 32-bit word at offset 4 of the capability vector
1297 and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
1298 mov %r9d,0x70+12(%rsp)
1300 $movkey 0x10($key),$rndkey1
1302 movdqa 0x40(%rsp),$inout4
1303 movdqa 0x50(%rsp),$inout5
1305 cmp \$8,$len # $len is in blocks
1306 jb .Lctr32_tail # short input if ($len<8)
1308 sub \$6,$len # $len is biased by -6
1309 cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
1310 je .Lctr32_6x # [which denotes Atom Silvermont]
1312 lea 0x80($key),$key # size optimization
1313 sub \$2,$len # $len is biased by -8
1321 lea 32($key,$rounds),$key # end of key schedule
1322 sub %rax,%r10 # twisted $rounds
# 6-way path: six counter lanes are advanced per iteration while the next
# counter values are written to the stack slots with movbe.
1327 add \$6,$ctr # next counter value
1328 $movkey -48($key,$rnds_),$rndkey0
1329 aesenc $rndkey1,$inout0
1332 aesenc $rndkey1,$inout1
1333 movbe %eax,`0x00+12`(%rsp) # store next counter value
1335 aesenc $rndkey1,$inout2
1337 movbe %eax,`0x10+12`(%rsp)
1338 aesenc $rndkey1,$inout3
1341 aesenc $rndkey1,$inout4
1342 movbe %eax,`0x20+12`(%rsp)
1344 aesenc $rndkey1,$inout5
1345 $movkey -32($key,$rnds_),$rndkey1
1348 aesenc $rndkey0,$inout0
1349 movbe %eax,`0x30+12`(%rsp)
1351 aesenc $rndkey0,$inout1
1353 movbe %eax,`0x40+12`(%rsp)
1354 aesenc $rndkey0,$inout2
1357 aesenc $rndkey0,$inout3
1358 movbe %eax,`0x50+12`(%rsp)
1359 mov %r10,%rax # mov $rnds_,$rounds
1360 aesenc $rndkey0,$inout4
1361 aesenc $rndkey0,$inout5
1362 $movkey -16($key,$rnds_),$rndkey0
1366 movdqu ($inp),$inout6 # load 6 input blocks
1367 movdqu 0x10($inp),$inout7
1368 movdqu 0x20($inp),$in0
1369 movdqu 0x30($inp),$in1
1370 movdqu 0x40($inp),$in2
1371 movdqu 0x50($inp),$in3
1372 lea 0x60($inp),$inp # $inp+=6*16
1373 $movkey -64($key,$rnds_),$rndkey1
1374 pxor $inout0,$inout6 # inp^=E(ctr)
1375 movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
1376 pxor $inout1,$inout7
1377 movaps 0x10(%rsp),$inout1
1379 movaps 0x20(%rsp),$inout2
1381 movaps 0x30(%rsp),$inout3
1383 movaps 0x40(%rsp),$inout4
1385 movaps 0x50(%rsp),$inout5
1386 movdqu $inout6,($out) # store 6 output blocks
1387 movdqu $inout7,0x10($out)
1388 movdqu $in0,0x20($out)
1389 movdqu $in1,0x30($out)
1390 movdqu $in2,0x40($out)
1391 movdqu $in3,0x50($out)
1392 lea 0x60($out),$out # $out+=6*16
1395 jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
1397 add \$6,$len # restore real remaining $len
1398 jz .Lctr32_done # done if ($len==0)
1400 lea -48($rnds_),$rounds
1401 lea -80($key,$rnds_),$key # restore $key
1403 shr \$4,$rounds # restore $rounds
# 8-way path: eight counter lanes per iteration, with round keys addressed
# relative to the key pointer that was biased by 0x80 above.
1408 add \$8,$ctr # next counter value
1409 movdqa 0x60(%rsp),$inout6
1410 aesenc $rndkey1,$inout0
1412 movdqa 0x70(%rsp),$inout7
1413 aesenc $rndkey1,$inout1
1415 $movkey 0x20-0x80($key),$rndkey0
1416 aesenc $rndkey1,$inout2
1419 aesenc $rndkey1,$inout3
1420 mov %r9d,0x00+12(%rsp) # store next counter value
1422 aesenc $rndkey1,$inout4
1423 aesenc $rndkey1,$inout5
1424 aesenc $rndkey1,$inout6
1425 aesenc $rndkey1,$inout7
1426 $movkey 0x30-0x80($key),$rndkey1
1428 for($i=2;$i<8;$i++) { # emit the 8-way aesenc group six times (i = 2..7)
1429 my $rndkeyx = ($i&1)?$rndkey1:$rndkey0; # alternate the two round-key registers by parity of i
1432 aesenc $rndkeyx,$inout0
1433 aesenc $rndkeyx,$inout1
1436 aesenc $rndkeyx,$inout2
1437 aesenc $rndkeyx,$inout3
1438 mov %r9d,`0x10*($i-1)`+12(%rsp)
1440 aesenc $rndkeyx,$inout4
1441 aesenc $rndkeyx,$inout5
1442 aesenc $rndkeyx,$inout6
1443 aesenc $rndkeyx,$inout7
1444 $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
1449 aesenc $rndkey0,$inout0
1450 aesenc $rndkey0,$inout1
1451 aesenc $rndkey0,$inout2
1453 movdqu 0x00($inp),$in0 # start loading input
1454 aesenc $rndkey0,$inout3
1455 mov %r9d,0x70+12(%rsp)
1457 aesenc $rndkey0,$inout4
1458 aesenc $rndkey0,$inout5
1459 aesenc $rndkey0,$inout6
1460 aesenc $rndkey0,$inout7
1461 $movkey 0xa0-0x80($key),$rndkey0
1465 aesenc $rndkey1,$inout0
1466 aesenc $rndkey1,$inout1
1467 aesenc $rndkey1,$inout2
1468 aesenc $rndkey1,$inout3
1469 aesenc $rndkey1,$inout4
1470 aesenc $rndkey1,$inout5
1471 aesenc $rndkey1,$inout6
1472 aesenc $rndkey1,$inout7
1473 $movkey 0xb0-0x80($key),$rndkey1
1475 aesenc $rndkey0,$inout0
1476 aesenc $rndkey0,$inout1
1477 aesenc $rndkey0,$inout2
1478 aesenc $rndkey0,$inout3
1479 aesenc $rndkey0,$inout4
1480 aesenc $rndkey0,$inout5
1481 aesenc $rndkey0,$inout6
1482 aesenc $rndkey0,$inout7
1483 $movkey 0xc0-0x80($key),$rndkey0
1486 aesenc $rndkey1,$inout0
1487 aesenc $rndkey1,$inout1
1488 aesenc $rndkey1,$inout2
1489 aesenc $rndkey1,$inout3
1490 aesenc $rndkey1,$inout4
1491 aesenc $rndkey1,$inout5
1492 aesenc $rndkey1,$inout6
1493 aesenc $rndkey1,$inout7
1494 $movkey 0xd0-0x80($key),$rndkey1
1496 aesenc $rndkey0,$inout0
1497 aesenc $rndkey0,$inout1
1498 aesenc $rndkey0,$inout2
1499 aesenc $rndkey0,$inout3
1500 aesenc $rndkey0,$inout4
1501 aesenc $rndkey0,$inout5
1502 aesenc $rndkey0,$inout6
1503 aesenc $rndkey0,$inout7
1504 $movkey 0xe0-0x80($key),$rndkey0
1505 jmp .Lctr32_enc_done
1509 movdqu 0x10($inp),$in1
1510 pxor $rndkey0,$in0 # input^=round[last]
1511 movdqu 0x20($inp),$in2
1513 movdqu 0x30($inp),$in3
1515 movdqu 0x40($inp),$in4
1517 movdqu 0x50($inp),$in5
1520 aesenc $rndkey1,$inout0
1521 aesenc $rndkey1,$inout1
1522 aesenc $rndkey1,$inout2
1523 aesenc $rndkey1,$inout3
1524 aesenc $rndkey1,$inout4
1525 aesenc $rndkey1,$inout5
1526 aesenc $rndkey1,$inout6
1527 aesenc $rndkey1,$inout7
1528 movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
1529 lea 0x80($inp),$inp # $inp+=8*16
1531 aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
1532 pxor $rndkey0,$rndkey1 # borrowed $rndkey
1533 movdqu 0x70-0x80($inp),$in0
1534 aesenclast $in1,$inout1
1536 movdqa 0x00(%rsp),$in1 # load next counter block
1537 aesenclast $in2,$inout2
1538 aesenclast $in3,$inout3
1539 movdqa 0x10(%rsp),$in2
1540 movdqa 0x20(%rsp),$in3
1541 aesenclast $in4,$inout4
1542 aesenclast $in5,$inout5
1543 movdqa 0x30(%rsp),$in4
1544 movdqa 0x40(%rsp),$in5
1545 aesenclast $rndkey1,$inout6
1546 movdqa 0x50(%rsp),$rndkey0
1547 $movkey 0x10-0x80($key),$rndkey1#real 1st-round key
1548 aesenclast $in0,$inout7
1550 movups $inout0,($out) # store 8 output blocks
1552 movups $inout1,0x10($out)
1554 movups $inout2,0x20($out)
1556 movups $inout3,0x30($out)
1558 movups $inout4,0x40($out)
1560 movups $inout5,0x50($out)
1561 movdqa $rndkey0,$inout5
1562 movups $inout6,0x60($out)
1563 movups $inout7,0x70($out)
1564 lea 0x80($out),$out # $out+=8*16
1567 jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
1569 add \$8,$len # restore real remaining $len
1570 jz .Lctr32_done # done if ($len==0)
1571 lea -0x80($key),$key
1574 # note that at this point $inout0..5 are populated with
1575 # counter values xor-ed with 0-round key
1581 # if ($len>4) compute 7 E(counter)
1583 movdqa 0x60(%rsp),$inout6
1584 pxor $inout7,$inout7
1586 $movkey 16($key),$rndkey0
1587 aesenc $rndkey1,$inout0
1588 aesenc $rndkey1,$inout1
1589 lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
1591 aesenc $rndkey1,$inout2
1592 add \$16,%rax # prepare for .Lenc_loop8_enter
1594 aesenc $rndkey1,$inout3
1595 aesenc $rndkey1,$inout4
1596 movups 0x10($inp),$in1 # pre-load input
1597 movups 0x20($inp),$in2
1598 aesenc $rndkey1,$inout5
1599 aesenc $rndkey1,$inout6
1601 call .Lenc_loop8_enter # label is defined outside this chunk
1603 movdqu 0x30($inp),$in3
1605 movdqu 0x40($inp),$in0
1607 movdqu $inout0,($out) # store output
1609 movdqu $inout1,0x10($out)
1611 movdqu $inout2,0x20($out)
1613 movdqu $inout3,0x30($out)
1614 movdqu $inout4,0x40($out)
1616 jb .Lctr32_done # $len was 5, stop store
1618 movups 0x50($inp),$in1
1620 movups $inout5,0x50($out)
1621 je .Lctr32_done # $len was 6, stop store
1623 movups 0x60($inp),$in2
1625 movups $inout6,0x60($out)
1626 jmp .Lctr32_done # $len was 7, stop store
1630 aesenc $rndkey1,$inout0
1633 aesenc $rndkey1,$inout1
1634 aesenc $rndkey1,$inout2
1635 aesenc $rndkey1,$inout3
1636 $movkey ($key),$rndkey1
1638 aesenclast $rndkey1,$inout0
1639 aesenclast $rndkey1,$inout1
1640 movups ($inp),$in0 # load input
1641 movups 0x10($inp),$in1
1642 aesenclast $rndkey1,$inout2
1643 aesenclast $rndkey1,$inout3
1644 movups 0x20($inp),$in2
1645 movups 0x30($inp),$in3
1648 movups $inout0,($out) # store output
1650 movups $inout1,0x10($out)
1652 movdqu $inout2,0x20($out)
1654 movdqu $inout3,0x30($out)
1655 jmp .Lctr32_done # $len was 4, stop store
1659 aesenc $rndkey1,$inout0
1662 aesenc $rndkey1,$inout1
1663 aesenc $rndkey1,$inout2
1664 $movkey ($key),$rndkey1
1666 aesenclast $rndkey1,$inout0
1667 aesenclast $rndkey1,$inout1
1668 aesenclast $rndkey1,$inout2
1670 movups ($inp),$in0 # load input
1672 movups $inout0,($out) # store output
1674 jb .Lctr32_done # $len was 1, stop store
1676 movups 0x10($inp),$in1
1678 movups $inout1,0x10($out)
1679 je .Lctr32_done # $len was 2, stop store
1681 movups 0x20($inp),$in2
1683 movups $inout2,0x20($out) # $len was 3, stop store
1686 xorps %xmm0,%xmm0 # clear register bank
# Non-Win64 exit: overwrite the eight 16-byte counter slots with the zeroed xmm0.
1694 $code.=<<___ if (!$win64);
1697 movaps %xmm0,0x00(%rsp) # clear stack
1699 movaps %xmm0,0x10(%rsp)
1701 movaps %xmm0,0x20(%rsp)
1703 movaps %xmm0,0x30(%rsp)
1705 movaps %xmm0,0x40(%rsp)
1707 movaps %xmm0,0x50(%rsp)
1709 movaps %xmm0,0x60(%rsp)
1711 movaps %xmm0,0x70(%rsp)
# Win64 exit: restore xmm6-xmm15 from the frame, overwriting each save slot
# right after it is read, then overwrite the eight counter slots as well.
1714 $code.=<<___ if ($win64);
1715 movaps -0xa8($key_),%xmm6
1716 movaps %xmm0,-0xa8($key_) # clear stack
1717 movaps -0x98($key_),%xmm7
1718 movaps %xmm0,-0x98($key_)
1719 movaps -0x88($key_),%xmm8
1720 movaps %xmm0,-0x88($key_)
1721 movaps -0x78($key_),%xmm9
1722 movaps %xmm0,-0x78($key_)
1723 movaps -0x68($key_),%xmm10
1724 movaps %xmm0,-0x68($key_)
1725 movaps -0x58($key_),%xmm11
1726 movaps %xmm0,-0x58($key_)
1727 movaps -0x48($key_),%xmm12
1728 movaps %xmm0,-0x48($key_)
1729 movaps -0x38($key_),%xmm13
1730 movaps %xmm0,-0x38($key_)
1731 movaps -0x28($key_),%xmm14
1732 movaps %xmm0,-0x28($key_)
1733 movaps -0x18($key_),%xmm15
1734 movaps %xmm0,-0x18($key_)
1735 movaps %xmm0,0x00(%rsp)
1736 movaps %xmm0,0x10(%rsp)
1737 movaps %xmm0,0x20(%rsp)
1738 movaps %xmm0,0x30(%rsp)
1739 movaps %xmm0,0x40(%rsp)
1740 movaps %xmm0,0x50(%rsp)
1741 movaps %xmm0,0x60(%rsp)
1742 movaps %xmm0,0x70(%rsp)
1748 .cfi_def_cfa_register %rsp
1752 .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1756 ######################################################################
1757 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1758 # const AES_KEY *key1, const AES_KEY *key2,
1759 # const unsigned char iv[16]);
# Register plan used by the XTS code below: tweak[0..5] live in
# xmm10..xmm15, twmask (xmm8) is loaded from .Lxts_magic, twres (xmm9)
# receives the pshufd 0x5f shuffle of the tweak, and twtmp aliases tweak[4].
# ivp and len_ both name r9: the IV is read through ivp before len is
# backed up into len_. The frame is 0x70 bytes (slots 0x00-0x50 hold the six
# per-block values consumed by aesenclast, slot 0x60 holds
# round[0]^round[last]), with 160 more bytes on Win64 for the ten saved XMM
# registers.
1762 my @tweak=map("%xmm$_",(10..15));
1763 my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1764 my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
1765 my $frame_size = 0x70 + ($win64?160:0);
1766 my $key_ = "%rbp"; # override so that we can use %r11 as FP
1769 .globl aesni_xts_encrypt
1770 .type aesni_xts_encrypt,\@function,6
1774 lea (%rsp),%r11 # frame pointer
1775 .cfi_def_cfa_register %r11
1778 sub \$$frame_size,%rsp
1779 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
1781 $code.=<<___ if ($win64);
1782 movaps %xmm6,-0xa8(%r11) # offload everything
1783 movaps %xmm7,-0x98(%r11)
1784 movaps %xmm8,-0x88(%r11)
1785 movaps %xmm9,-0x78(%r11)
1786 movaps %xmm10,-0x68(%r11)
1787 movaps %xmm11,-0x58(%r11)
1788 movaps %xmm12,-0x48(%r11)
1789 movaps %xmm13,-0x38(%r11)
1790 movaps %xmm14,-0x28(%r11)
1791 movaps %xmm15,-0x18(%r11)
1795 movups ($ivp),$inout0 # load clear-text tweak
1796 mov 240(%r8),$rounds # key2->rounds
1797 mov 240($key),$rnds_ # key1->rounds
1799 # generate the tweak
1800 &aesni_generate1("enc",$key2,$rounds,$inout0); # helper defined outside this chunk (called in "enc" mode with key2)
1802 $movkey ($key),$rndkey0 # zero round key
1803 mov $key,$key_ # backup $key
1804 mov $rnds_,$rounds # backup $rounds
1806 mov $len,$len_ # backup $len
1809 $movkey 16($key,$rnds_),$rndkey1 # last round key
1811 movdqa .Lxts_magic(%rip),$twmask
1812 movdqa $inout0,@tweak[5]
1813 pshufd \$0x5f,$inout0,$twres
1814 pxor $rndkey0,$rndkey1
1816 # alternative tweak calculation algorithm is based on suggestions
1817 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
1818 # and should help in the future...
1819 for ($i=0;$i<4;$i++) { # emit tweak[0..3], each xor-ed with the zero round key
1821 movdqa $twres,$twtmp
1823 movdqa @tweak[5],@tweak[$i]
1824 psrad \$31,$twtmp # broadcast upper bits
1825 paddq @tweak[5],@tweak[5]
1827 pxor $rndkey0,@tweak[$i]
1828 pxor $twtmp,@tweak[5]
1832 movdqa @tweak[5],@tweak[4]
1834 paddq @tweak[5],@tweak[5]
1836 pxor $rndkey0,@tweak[4]
1837 pxor $twres,@tweak[5]
1838 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
1841 jc .Lxts_enc_short # if $len-=6*16 borrowed
1844 lea 32($key_,$rnds_),$key # end of key schedule
1845 sub %r10,%rax # twisted $rounds
1846 $movkey 16($key_),$rndkey1
1847 mov %rax,%r10 # backup twisted $rounds
1848 lea .Lxts_magic(%rip),%r8 # r8 addresses the constant that is reloaded into twmask inside the loop
1849 jmp .Lxts_enc_grandloop
1852 .Lxts_enc_grandloop:
1853 movdqu `16*0`($inp),$inout0 # load input
1854 movdqa $rndkey0,$twmask
1855 movdqu `16*1`($inp),$inout1
1856 pxor @tweak[0],$inout0 # input^=tweak^round[0]
1857 movdqu `16*2`($inp),$inout2
1858 pxor @tweak[1],$inout1
1859 aesenc $rndkey1,$inout0
1860 movdqu `16*3`($inp),$inout3
1861 pxor @tweak[2],$inout2
1862 aesenc $rndkey1,$inout1
1863 movdqu `16*4`($inp),$inout4
1864 pxor @tweak[3],$inout3
1865 aesenc $rndkey1,$inout2
1866 movdqu `16*5`($inp),$inout5
1867 pxor @tweak[5],$twmask # round[0]^=tweak[5]
1868 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
1869 pxor @tweak[4],$inout4
1870 aesenc $rndkey1,$inout3
1871 $movkey 32($key_),$rndkey0
1872 lea `16*6`($inp),$inp
1873 pxor $twmask,$inout5
1875 pxor $twres,@tweak[0] # calculate tweaks^round[last]
1876 aesenc $rndkey1,$inout4
1877 pxor $twres,@tweak[1]
1878 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last]
1879 aesenc $rndkey1,$inout5
1880 $movkey 48($key_),$rndkey1
1881 pxor $twres,@tweak[2]
1883 aesenc $rndkey0,$inout0
1884 pxor $twres,@tweak[3]
1885 movdqa @tweak[1],`16*1`(%rsp)
1886 aesenc $rndkey0,$inout1
1887 pxor $twres,@tweak[4]
1888 movdqa @tweak[2],`16*2`(%rsp)
1889 aesenc $rndkey0,$inout2
1890 aesenc $rndkey0,$inout3
1892 movdqa @tweak[4],`16*4`(%rsp)
1893 aesenc $rndkey0,$inout4
1894 aesenc $rndkey0,$inout5
1895 $movkey 64($key_),$rndkey0
1896 movdqa $twmask,`16*5`(%rsp)
1897 pshufd \$0x5f,@tweak[5],$twres
1901 aesenc $rndkey1,$inout0
1902 aesenc $rndkey1,$inout1
1903 aesenc $rndkey1,$inout2
1904 aesenc $rndkey1,$inout3
1905 aesenc $rndkey1,$inout4
1906 aesenc $rndkey1,$inout5
1907 $movkey -64($key,%rax),$rndkey1
1910 aesenc $rndkey0,$inout0
1911 aesenc $rndkey0,$inout1
1912 aesenc $rndkey0,$inout2
1913 aesenc $rndkey0,$inout3
1914 aesenc $rndkey0,$inout4
1915 aesenc $rndkey0,$inout5
1916 $movkey -80($key,%rax),$rndkey0
1919 movdqa (%r8),$twmask # start calculating next tweak
1920 movdqa $twres,$twtmp
1922 aesenc $rndkey1,$inout0
1923 paddq @tweak[5],@tweak[5]
1925 aesenc $rndkey1,$inout1
1927 $movkey ($key_),@tweak[0] # load round[0]
1928 aesenc $rndkey1,$inout2
1929 aesenc $rndkey1,$inout3
1930 aesenc $rndkey1,$inout4
1931 pxor $twtmp,@tweak[5]
1932 movaps @tweak[0],@tweak[1] # copy round[0]
1933 aesenc $rndkey1,$inout5
1934 $movkey -64($key),$rndkey1
1936 movdqa $twres,$twtmp
1937 aesenc $rndkey0,$inout0
1939 pxor @tweak[5],@tweak[0]
1940 aesenc $rndkey0,$inout1
1942 paddq @tweak[5],@tweak[5]
1943 aesenc $rndkey0,$inout2
1944 aesenc $rndkey0,$inout3
1946 movaps @tweak[1],@tweak[2]
1947 aesenc $rndkey0,$inout4
1948 pxor $twtmp,@tweak[5]
1949 movdqa $twres,$twtmp
1950 aesenc $rndkey0,$inout5
1951 $movkey -48($key),$rndkey0
1954 aesenc $rndkey1,$inout0
1955 pxor @tweak[5],@tweak[1]
1957 aesenc $rndkey1,$inout1
1958 paddq @tweak[5],@tweak[5]
1960 aesenc $rndkey1,$inout2
1961 aesenc $rndkey1,$inout3
1962 movdqa @tweak[3],`16*3`(%rsp)
1963 pxor $twtmp,@tweak[5]
1964 aesenc $rndkey1,$inout4
1965 movaps @tweak[2],@tweak[3]
1966 movdqa $twres,$twtmp
1967 aesenc $rndkey1,$inout5
1968 $movkey -32($key),$rndkey1
1971 aesenc $rndkey0,$inout0
1972 pxor @tweak[5],@tweak[2]
1974 aesenc $rndkey0,$inout1
1975 paddq @tweak[5],@tweak[5]
1977 aesenc $rndkey0,$inout2
1978 aesenc $rndkey0,$inout3
1979 aesenc $rndkey0,$inout4
1980 pxor $twtmp,@tweak[5]
1981 movaps @tweak[3],@tweak[4]
1982 aesenc $rndkey0,$inout5
1984 movdqa $twres,$rndkey0
1986 aesenc $rndkey1,$inout0
1987 pxor @tweak[5],@tweak[3]
1989 aesenc $rndkey1,$inout1
1990 paddq @tweak[5],@tweak[5]
1991 pand $twmask,$rndkey0
1992 aesenc $rndkey1,$inout2
1993 aesenc $rndkey1,$inout3
1994 pxor $rndkey0,@tweak[5]
1995 $movkey ($key_),$rndkey0
1996 aesenc $rndkey1,$inout4
1997 aesenc $rndkey1,$inout5
1998 $movkey 16($key_),$rndkey1
2000 pxor @tweak[5],@tweak[4]
2001 aesenclast `16*0`(%rsp),$inout0
2003 paddq @tweak[5],@tweak[5]
2004 aesenclast `16*1`(%rsp),$inout1
2005 aesenclast `16*2`(%rsp),$inout2
2007 mov %r10,%rax # restore $rounds
2008 aesenclast `16*3`(%rsp),$inout3
2009 aesenclast `16*4`(%rsp),$inout4
2010 aesenclast `16*5`(%rsp),$inout5
2011 pxor $twres,@tweak[5]
2013 lea `16*6`($out),$out # $out+=6*16
2014 movups $inout0,`-16*6`($out) # store 6 output blocks
2015 movups $inout1,`-16*5`($out)
2016 movups $inout2,`-16*4`($out)
2017 movups $inout3,`-16*3`($out)
2018 movups $inout4,`-16*2`($out)
2019 movups $inout5,`-16*1`($out)
2021 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow
2025 mov $key_,$key # restore $key
2026 shr \$4,$rounds # restore original value
2029 # at this point @tweak[0..5] are populated with tweak values
2030 mov $rounds,$rnds_ # backup $rounds
2031 pxor $rndkey0,@tweak[0]
2032 add \$16*6,$len # restore real remaining $len
2033 jz .Lxts_enc_done # done if ($len==0)
2035 pxor $rndkey0,@tweak[1]
2037 jb .Lxts_enc_one # $len is 1*16
2038 pxor $rndkey0,@tweak[2]
2039 je .Lxts_enc_two # $len is 2*16
2041 pxor $rndkey0,@tweak[3]
2043 jb .Lxts_enc_three # $len is 3*16
2044 pxor $rndkey0,@tweak[4]
2045 je .Lxts_enc_four # $len is 4*16
2047 movdqu ($inp),$inout0 # $len is 5*16
2048 movdqu 16*1($inp),$inout1
2049 movdqu 16*2($inp),$inout2
2050 pxor @tweak[0],$inout0
2051 movdqu 16*3($inp),$inout3
2052 pxor @tweak[1],$inout1
2053 movdqu 16*4($inp),$inout4
2054 lea 16*5($inp),$inp # $inp+=5*16
2055 pxor @tweak[2],$inout2
2056 pxor @tweak[3],$inout3
2057 pxor @tweak[4],$inout4
2058 pxor $inout5,$inout5 # zero the unused sixth lane before the 6-way helper
2060 call _aesni_encrypt6 # defined outside this chunk
2062 xorps @tweak[0],$inout0
2063 movdqa @tweak[5],@tweak[0]
2064 xorps @tweak[1],$inout1
2065 xorps @tweak[2],$inout2
2066 movdqu $inout0,($out) # store 5 output blocks
2067 xorps @tweak[3],$inout3
2068 movdqu $inout1,16*1($out)
2069 xorps @tweak[4],$inout4
2070 movdqu $inout2,16*2($out)
2071 movdqu $inout3,16*3($out)
2072 movdqu $inout4,16*4($out)
2073 lea 16*5($out),$out # $out+=5*16
2078 movups ($inp),$inout0
2079 lea 16*1($inp),$inp # inp+=1*16
2080 xorps @tweak[0],$inout0
2082 &aesni_generate1("enc",$key,$rounds); # helper defined outside this chunk (called in "enc" mode)
2084 xorps @tweak[0],$inout0
2085 movdqa @tweak[1],@tweak[0]
2086 movups $inout0,($out) # store one output block
2087 lea 16*1($out),$out # $out+=1*16
2092 movups ($inp),$inout0
2093 movups 16($inp),$inout1
2094 lea 32($inp),$inp # $inp+=2*16
2095 xorps @tweak[0],$inout0
2096 xorps @tweak[1],$inout1
2098 call _aesni_encrypt2 # defined outside this chunk
2100 xorps @tweak[0],$inout0
2101 movdqa @tweak[2],@tweak[0]
2102 xorps @tweak[1],$inout1
2103 movups $inout0,($out) # store 2 output blocks
2104 movups $inout1,16*1($out)
2105 lea 16*2($out),$out # $out+=2*16
2110 movups ($inp),$inout0
2111 movups 16*1($inp),$inout1
2112 movups 16*2($inp),$inout2
2113 lea 16*3($inp),$inp # $inp+=3*16
2114 xorps @tweak[0],$inout0
2115 xorps @tweak[1],$inout1
2116 xorps @tweak[2],$inout2
2118 call _aesni_encrypt3 # defined outside this chunk
2120 xorps @tweak[0],$inout0
2121 movdqa @tweak[3],@tweak[0]
2122 xorps @tweak[1],$inout1
2123 xorps @tweak[2],$inout2
2124 movups $inout0,($out) # store 3 output blocks
2125 movups $inout1,16*1($out)
2126 movups $inout2,16*2($out)
2127 lea 16*3($out),$out # $out+=3*16
2132 movups ($inp),$inout0
2133 movups 16*1($inp),$inout1
2134 movups 16*2($inp),$inout2
2135 xorps @tweak[0],$inout0
2136 movups 16*3($inp),$inout3
2137 lea 16*4($inp),$inp # $inp+=4*16
2138 xorps @tweak[1],$inout1
2139 xorps @tweak[2],$inout2
2140 xorps @tweak[3],$inout3
2142 call _aesni_encrypt4 # defined outside this chunk
2144 pxor @tweak[0],$inout0
2145 movdqa @tweak[4],@tweak[0]
2146 pxor @tweak[1],$inout1
2147 pxor @tweak[2],$inout2
2148 movdqu $inout0,($out) # store 4 output blocks
2149 pxor @tweak[3],$inout3
2150 movdqu $inout1,16*1($out)
2151 movdqu $inout2,16*2($out)
2152 movdqu $inout3,16*3($out)
2153 lea 16*4($out),$out # $out+=4*16
2158 and \$15,$len_ # see if $len%16 is 0
2163 movzb ($inp),%eax # borrow $rounds ...
2164 movzb -16($out),%ecx # ... and $key
2172 sub $len_,$out # rewind $out
2173 mov $key_,$key # restore $key
2174 mov $rnds_,$rounds # restore $rounds
2176 movups -16($out),$inout0
2177 xorps @tweak[0],$inout0
2179 &aesni_generate1("enc",$key,$rounds); # helper defined outside this chunk (called in "enc" mode)
2181 xorps @tweak[0],$inout0
2182 movups $inout0,-16($out)
2185 xorps %xmm0,%xmm0 # clear register bank
# Non-Win64 exit: overwrite the seven 16-byte frame slots with the zeroed xmm0.
2192 $code.=<<___ if (!$win64);
2195 movaps %xmm0,0x00(%rsp) # clear stack
2197 movaps %xmm0,0x10(%rsp)
2199 movaps %xmm0,0x20(%rsp)
2201 movaps %xmm0,0x30(%rsp)
2203 movaps %xmm0,0x40(%rsp)
2205 movaps %xmm0,0x50(%rsp)
2207 movaps %xmm0,0x60(%rsp)
# Win64 exit: restore xmm6-xmm15 through the r11 frame pointer, overwriting
# each save slot right after it is read, then overwrite the seven frame slots.
2211 $code.=<<___ if ($win64);
2212 movaps -0xa8(%r11),%xmm6
2213 movaps %xmm0,-0xa8(%r11) # clear stack
2214 movaps -0x98(%r11),%xmm7
2215 movaps %xmm0,-0x98(%r11)
2216 movaps -0x88(%r11),%xmm8
2217 movaps %xmm0,-0x88(%r11)
2218 movaps -0x78(%r11),%xmm9
2219 movaps %xmm0,-0x78(%r11)
2220 movaps -0x68(%r11),%xmm10
2221 movaps %xmm0,-0x68(%r11)
2222 movaps -0x58(%r11),%xmm11
2223 movaps %xmm0,-0x58(%r11)
2224 movaps -0x48(%r11),%xmm12
2225 movaps %xmm0,-0x48(%r11)
2226 movaps -0x38(%r11),%xmm13
2227 movaps %xmm0,-0x38(%r11)
2228 movaps -0x28(%r11),%xmm14
2229 movaps %xmm0,-0x28(%r11)
2230 movaps -0x18(%r11),%xmm15
2231 movaps %xmm0,-0x18(%r11)
2232 movaps %xmm0,0x00(%rsp)
2233 movaps %xmm0,0x10(%rsp)
2234 movaps %xmm0,0x20(%rsp)
2235 movaps %xmm0,0x30(%rsp)
2236 movaps %xmm0,0x40(%rsp)
2237 movaps %xmm0,0x50(%rsp)
2238 movaps %xmm0,0x60(%rsp)
2244 .cfi_def_cfa_register %rsp
2248 .size aesni_xts_encrypt,.-aesni_xts_encrypt
2252 .globl aesni_xts_decrypt
2253 .type aesni_xts_decrypt,\@function,6
2257 lea (%rsp),%r11 # frame pointer
2258 .cfi_def_cfa_register %r11
2261 sub \$$frame_size,%rsp
2262 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
2264 $code.=<<___ if ($win64);
2265 movaps %xmm6,-0xa8(%r11) # offload everything
2266 movaps %xmm7,-0x98(%r11)
2267 movaps %xmm8,-0x88(%r11)
2268 movaps %xmm9,-0x78(%r11)
2269 movaps %xmm10,-0x68(%r11)
2270 movaps %xmm11,-0x58(%r11)
2271 movaps %xmm12,-0x48(%r11)
2272 movaps %xmm13,-0x38(%r11)
2273 movaps %xmm14,-0x28(%r11)
2274 movaps %xmm15,-0x18(%r11)
2278 movups ($ivp),$inout0 # load clear-text tweak
2279 mov 240($key2),$rounds # key2->rounds
2280 mov 240($key),$rnds_ # key1->rounds
2282 # generate the tweak
2283 &aesni_generate1("enc",$key2,$rounds,$inout0);
2285 xor %eax,%eax # if ($len%16) len-=16;
2291 $movkey ($key),$rndkey0 # zero round key
2292 mov $key,$key_ # backup $key
2293 mov $rnds_,$rounds # backup $rounds
2295 mov $len,$len_ # backup $len
2298 $movkey 16($key,$rnds_),$rndkey1 # last round key
2300 movdqa .Lxts_magic(%rip),$twmask
2301 movdqa $inout0,@tweak[5]
2302 pshufd \$0x5f,$inout0,$twres
2303 pxor $rndkey0,$rndkey1
2305 for ($i=0;$i<4;$i++) {
2307 movdqa $twres,$twtmp
2309 movdqa @tweak[5],@tweak[$i]
2310 psrad \$31,$twtmp # broadcast upper bits
2311 paddq @tweak[5],@tweak[5]
2313 pxor $rndkey0,@tweak[$i]
2314 pxor $twtmp,@tweak[5]
2318 movdqa @tweak[5],@tweak[4]
2320 paddq @tweak[5],@tweak[5]
2322 pxor $rndkey0,@tweak[4]
2323 pxor $twres,@tweak[5]
2324 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
2327 jc .Lxts_dec_short # if $len-=6*16 borrowed
2330 lea 32($key_,$rnds_),$key # end of key schedule
2331 sub %r10,%rax # twisted $rounds
2332 $movkey 16($key_),$rndkey1
2333 mov %rax,%r10 # backup twisted $rounds
2334 lea .Lxts_magic(%rip),%r8
2335 jmp .Lxts_dec_grandloop
2338 .Lxts_dec_grandloop:
2339 movdqu `16*0`($inp),$inout0 # load input
2340 movdqa $rndkey0,$twmask
2341 movdqu `16*1`($inp),$inout1
2342 pxor @tweak[0],$inout0 # input^=tweak^round[0]
2343 movdqu `16*2`($inp),$inout2
2344 pxor @tweak[1],$inout1
2345 aesdec $rndkey1,$inout0
2346 movdqu `16*3`($inp),$inout3
2347 pxor @tweak[2],$inout2
2348 aesdec $rndkey1,$inout1
2349 movdqu `16*4`($inp),$inout4
2350 pxor @tweak[3],$inout3
2351 aesdec $rndkey1,$inout2
2352 movdqu `16*5`($inp),$inout5
2353 pxor @tweak[5],$twmask # round[0]^=tweak[5]
2354 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
2355 pxor @tweak[4],$inout4
2356 aesdec $rndkey1,$inout3
2357 $movkey 32($key_),$rndkey0
2358 lea `16*6`($inp),$inp
2359 pxor $twmask,$inout5
2361 pxor $twres,@tweak[0] # calculate tweaks^round[last]
2362 aesdec $rndkey1,$inout4
2363 pxor $twres,@tweak[1]
2364 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
2365 aesdec $rndkey1,$inout5
2366 $movkey 48($key_),$rndkey1
2367 pxor $twres,@tweak[2]
2369 aesdec $rndkey0,$inout0
2370 pxor $twres,@tweak[3]
2371 movdqa @tweak[1],`16*1`(%rsp)
2372 aesdec $rndkey0,$inout1
2373 pxor $twres,@tweak[4]
2374 movdqa @tweak[2],`16*2`(%rsp)
2375 aesdec $rndkey0,$inout2
2376 aesdec $rndkey0,$inout3
2378 movdqa @tweak[4],`16*4`(%rsp)
2379 aesdec $rndkey0,$inout4
2380 aesdec $rndkey0,$inout5
2381 $movkey 64($key_),$rndkey0
2382 movdqa $twmask,`16*5`(%rsp)
2383 pshufd \$0x5f,@tweak[5],$twres
2387 aesdec $rndkey1,$inout0
2388 aesdec $rndkey1,$inout1
2389 aesdec $rndkey1,$inout2
2390 aesdec $rndkey1,$inout3
2391 aesdec $rndkey1,$inout4
2392 aesdec $rndkey1,$inout5
2393 $movkey -64($key,%rax),$rndkey1
2396 aesdec $rndkey0,$inout0
2397 aesdec $rndkey0,$inout1
2398 aesdec $rndkey0,$inout2
2399 aesdec $rndkey0,$inout3
2400 aesdec $rndkey0,$inout4
2401 aesdec $rndkey0,$inout5
2402 $movkey -80($key,%rax),$rndkey0
2405 movdqa (%r8),$twmask # start calculating next tweak
2406 movdqa $twres,$twtmp
2408 aesdec $rndkey1,$inout0
2409 paddq @tweak[5],@tweak[5]
2411 aesdec $rndkey1,$inout1
2413 $movkey ($key_),@tweak[0] # load round[0]
2414 aesdec $rndkey1,$inout2
2415 aesdec $rndkey1,$inout3
2416 aesdec $rndkey1,$inout4
2417 pxor $twtmp,@tweak[5]
2418 movaps @tweak[0],@tweak[1] # copy round[0]
2419 aesdec $rndkey1,$inout5
2420 $movkey -64($key),$rndkey1
2422 movdqa $twres,$twtmp
2423 aesdec $rndkey0,$inout0
2425 pxor @tweak[5],@tweak[0]
2426 aesdec $rndkey0,$inout1
2428 paddq @tweak[5],@tweak[5]
2429 aesdec $rndkey0,$inout2
2430 aesdec $rndkey0,$inout3
2432 movaps @tweak[1],@tweak[2]
2433 aesdec $rndkey0,$inout4
2434 pxor $twtmp,@tweak[5]
2435 movdqa $twres,$twtmp
2436 aesdec $rndkey0,$inout5
2437 $movkey -48($key),$rndkey0
2440 aesdec $rndkey1,$inout0
2441 pxor @tweak[5],@tweak[1]
2443 aesdec $rndkey1,$inout1
2444 paddq @tweak[5],@tweak[5]
2446 aesdec $rndkey1,$inout2
2447 aesdec $rndkey1,$inout3
2448 movdqa @tweak[3],`16*3`(%rsp)
2449 pxor $twtmp,@tweak[5]
2450 aesdec $rndkey1,$inout4
2451 movaps @tweak[2],@tweak[3]
2452 movdqa $twres,$twtmp
2453 aesdec $rndkey1,$inout5
2454 $movkey -32($key),$rndkey1
2457 aesdec $rndkey0,$inout0
2458 pxor @tweak[5],@tweak[2]
2460 aesdec $rndkey0,$inout1
2461 paddq @tweak[5],@tweak[5]
2463 aesdec $rndkey0,$inout2
2464 aesdec $rndkey0,$inout3
2465 aesdec $rndkey0,$inout4
2466 pxor $twtmp,@tweak[5]
2467 movaps @tweak[3],@tweak[4]
2468 aesdec $rndkey0,$inout5
2470 movdqa $twres,$rndkey0
2472 aesdec $rndkey1,$inout0
2473 pxor @tweak[5],@tweak[3]
2475 aesdec $rndkey1,$inout1
2476 paddq @tweak[5],@tweak[5]
2477 pand $twmask,$rndkey0
2478 aesdec $rndkey1,$inout2
2479 aesdec $rndkey1,$inout3
2480 pxor $rndkey0,@tweak[5]
2481 $movkey ($key_),$rndkey0
2482 aesdec $rndkey1,$inout4
2483 aesdec $rndkey1,$inout5
2484 $movkey 16($key_),$rndkey1
2486 pxor @tweak[5],@tweak[4]
2487 aesdeclast `16*0`(%rsp),$inout0
2489 paddq @tweak[5],@tweak[5]
2490 aesdeclast `16*1`(%rsp),$inout1
2491 aesdeclast `16*2`(%rsp),$inout2
2493 mov %r10,%rax # restore $rounds
2494 aesdeclast `16*3`(%rsp),$inout3
2495 aesdeclast `16*4`(%rsp),$inout4
2496 aesdeclast `16*5`(%rsp),$inout5
2497 pxor $twres,@tweak[5]
2499 lea `16*6`($out),$out # $out+=6*16
2500 movups $inout0,`-16*6`($out) # store 6 output blocks
2501 movups $inout1,`-16*5`($out)
2502 movups $inout2,`-16*4`($out)
2503 movups $inout3,`-16*3`($out)
2504 movups $inout4,`-16*2`($out)
2505 movups $inout5,`-16*1`($out)
2507 jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow
2511 mov $key_,$key # restore $key
2512 shr \$4,$rounds # restore original value
2515 # at the point @tweak[0..5] are populated with tweak values
2516 mov $rounds,$rnds_ # backup $rounds
2517 pxor $rndkey0,@tweak[0]
2518 pxor $rndkey0,@tweak[1]
2519 add \$16*6,$len # restore real remaining $len
2520 jz .Lxts_dec_done # done if ($len==0)
2522 pxor $rndkey0,@tweak[2]
2524 jb .Lxts_dec_one # $len is 1*16
2525 pxor $rndkey0,@tweak[3]
2526 je .Lxts_dec_two # $len is 2*16
2528 pxor $rndkey0,@tweak[4]
2530 jb .Lxts_dec_three # $len is 3*16
2531 je .Lxts_dec_four # $len is 4*16
2533 movdqu ($inp),$inout0 # $len is 5*16
2534 movdqu 16*1($inp),$inout1
2535 movdqu 16*2($inp),$inout2
2536 pxor @tweak[0],$inout0
2537 movdqu 16*3($inp),$inout3
2538 pxor @tweak[1],$inout1
2539 movdqu 16*4($inp),$inout4
2540 lea 16*5($inp),$inp # $inp+=5*16
2541 pxor @tweak[2],$inout2
2542 pxor @tweak[3],$inout3
2543 pxor @tweak[4],$inout4
2545 call _aesni_decrypt6
2547 xorps @tweak[0],$inout0
2548 xorps @tweak[1],$inout1
2549 xorps @tweak[2],$inout2
2550 movdqu $inout0,($out) # store 5 output blocks
2551 xorps @tweak[3],$inout3
2552 movdqu $inout1,16*1($out)
2553 xorps @tweak[4],$inout4
2554 movdqu $inout2,16*2($out)
2556 movdqu $inout3,16*3($out)
2557 pcmpgtd @tweak[5],$twtmp
2558 movdqu $inout4,16*4($out)
2559 lea 16*5($out),$out # $out+=5*16
2560 pshufd \$0x13,$twtmp,@tweak[1] # $twres
2564 movdqa @tweak[5],@tweak[0]
2565 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
2566 pand $twmask,@tweak[1] # isolate carry and residue
2567 pxor @tweak[5],@tweak[1]
2572 movups ($inp),$inout0
2573 lea 16*1($inp),$inp # $inp+=1*16
2574 xorps @tweak[0],$inout0
2576 &aesni_generate1("dec",$key,$rounds);
2578 xorps @tweak[0],$inout0
2579 movdqa @tweak[1],@tweak[0]
2580 movups $inout0,($out) # store one output block
2581 movdqa @tweak[2],@tweak[1]
2582 lea 16*1($out),$out # $out+=1*16
2587 movups ($inp),$inout0
2588 movups 16($inp),$inout1
2589 lea 32($inp),$inp # $inp+=2*16
2590 xorps @tweak[0],$inout0
2591 xorps @tweak[1],$inout1
2593 call _aesni_decrypt2
2595 xorps @tweak[0],$inout0
2596 movdqa @tweak[2],@tweak[0]
2597 xorps @tweak[1],$inout1
2598 movdqa @tweak[3],@tweak[1]
2599 movups $inout0,($out) # store 2 output blocks
2600 movups $inout1,16*1($out)
2601 lea 16*2($out),$out # $out+=2*16
2606 movups ($inp),$inout0
2607 movups 16*1($inp),$inout1
2608 movups 16*2($inp),$inout2
2609 lea 16*3($inp),$inp # $inp+=3*16
2610 xorps @tweak[0],$inout0
2611 xorps @tweak[1],$inout1
2612 xorps @tweak[2],$inout2
2614 call _aesni_decrypt3
2616 xorps @tweak[0],$inout0
2617 movdqa @tweak[3],@tweak[0]
2618 xorps @tweak[1],$inout1
2619 movdqa @tweak[4],@tweak[1]
2620 xorps @tweak[2],$inout2
2621 movups $inout0,($out) # store 3 output blocks
2622 movups $inout1,16*1($out)
2623 movups $inout2,16*2($out)
2624 lea 16*3($out),$out # $out+=3*16
2629 movups ($inp),$inout0
2630 movups 16*1($inp),$inout1
2631 movups 16*2($inp),$inout2
2632 xorps @tweak[0],$inout0
2633 movups 16*3($inp),$inout3
2634 lea 16*4($inp),$inp # $inp+=4*16
2635 xorps @tweak[1],$inout1
2636 xorps @tweak[2],$inout2
2637 xorps @tweak[3],$inout3
2639 call _aesni_decrypt4
2641 pxor @tweak[0],$inout0
2642 movdqa @tweak[4],@tweak[0]
2643 pxor @tweak[1],$inout1
2644 movdqa @tweak[5],@tweak[1]
2645 pxor @tweak[2],$inout2
2646 movdqu $inout0,($out) # store 4 output blocks
2647 pxor @tweak[3],$inout3
2648 movdqu $inout1,16*1($out)
2649 movdqu $inout2,16*2($out)
2650 movdqu $inout3,16*3($out)
2651 lea 16*4($out),$out # $out+=4*16
2656 and \$15,$len_ # see if $len%16 is 0
2660 mov $key_,$key # restore $key
2661 mov $rnds_,$rounds # restore $rounds
2663 movups ($inp),$inout0
2664 xorps @tweak[1],$inout0
2666 &aesni_generate1("dec",$key,$rounds);
2668 xorps @tweak[1],$inout0
2669 movups $inout0,($out)
2672 movzb 16($inp),%eax # borrow $rounds ...
2673 movzb ($out),%ecx # ... and $key
2681 sub $len_,$out # rewind $out
2682 mov $key_,$key # restore $key
2683 mov $rnds_,$rounds # restore $rounds
2685 movups ($out),$inout0
2686 xorps @tweak[0],$inout0
2688 &aesni_generate1("dec",$key,$rounds);
2690 xorps @tweak[0],$inout0
2691 movups $inout0,($out)
2694 xorps %xmm0,%xmm0 # clear register bank
2701 $code.=<<___ if (!$win64);
2704 movaps %xmm0,0x00(%rsp) # clear stack
2706 movaps %xmm0,0x10(%rsp)
2708 movaps %xmm0,0x20(%rsp)
2710 movaps %xmm0,0x30(%rsp)
2712 movaps %xmm0,0x40(%rsp)
2714 movaps %xmm0,0x50(%rsp)
2716 movaps %xmm0,0x60(%rsp)
2720 $code.=<<___ if ($win64);
2721 movaps -0xa8(%r11),%xmm6
2722 movaps %xmm0,-0xa8(%r11) # clear stack
2723 movaps -0x98(%r11),%xmm7
2724 movaps %xmm0,-0x98(%r11)
2725 movaps -0x88(%r11),%xmm8
2726 movaps %xmm0,-0x88(%r11)
2727 movaps -0x78(%r11),%xmm9
2728 movaps %xmm0,-0x78(%r11)
2729 movaps -0x68(%r11),%xmm10
2730 movaps %xmm0,-0x68(%r11)
2731 movaps -0x58(%r11),%xmm11
2732 movaps %xmm0,-0x58(%r11)
2733 movaps -0x48(%r11),%xmm12
2734 movaps %xmm0,-0x48(%r11)
2735 movaps -0x38(%r11),%xmm13
2736 movaps %xmm0,-0x38(%r11)
2737 movaps -0x28(%r11),%xmm14
2738 movaps %xmm0,-0x28(%r11)
2739 movaps -0x18(%r11),%xmm15
2740 movaps %xmm0,-0x18(%r11)
2741 movaps %xmm0,0x00(%rsp)
2742 movaps %xmm0,0x10(%rsp)
2743 movaps %xmm0,0x20(%rsp)
2744 movaps %xmm0,0x30(%rsp)
2745 movaps %xmm0,0x40(%rsp)
2746 movaps %xmm0,0x50(%rsp)
2747 movaps %xmm0,0x60(%rsp)
2753 .cfi_def_cfa_register %rsp
2757 .size aesni_xts_decrypt,.-aesni_xts_decrypt
2761 ######################################################################
2762 # void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
2763 # const AES_KEY *key, unsigned int start_block_num,
2764 # unsigned char offset_i[16], const unsigned char L_[][16],
2765 # unsigned char checksum[16]);
# Register assignments shared by the OCB entry points and their helpers below.
2768 my @offset=map("%xmm$_",(10..15)); # six offset registers, %xmm10 through %xmm15
2769 my ($checksum,$rndkey0l)=("%xmm8","%xmm9"); # running checksum; round[0], later round[0] ^ round[last]
2770 my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments
2771 my ($L_p,$checksum_p) = ("%rbx","%rbp"); # pointers loaded from the 7th and 8th arguments
2772 my ($i1,$i3,$i5) = ("%r12","%r13","%r14"); # index registers into the table addressed by $L_p
2773 my $seventh_arg = $win64 ? 56 : 8; # displacement of the 7th argument relative to %rax
# Exported OCB encryption entry point; its C prototype is given in the
# banner comment that precedes the register assignments.
# State kept across the whole routine: @offset[5] carries
# offset_i ^ round[last], @offset[0] carries L_0, $rndkey0l carries
# round[0] ^ round[last] and $checksum carries the running checksum.
2777 .globl aesni_ocb_encrypt
2778 .type aesni_ocb_encrypt,\@function,6
# Win64 only: reserve 0xa0 bytes and spill %xmm6-%xmm15 into them.
2794 $code.=<<___ if ($win64);
2795 lea -0xa0(%rsp),%rsp
2796 movaps %xmm6,0x00(%rsp) # offload everything
2797 movaps %xmm7,0x10(%rsp)
2798 movaps %xmm8,0x20(%rsp)
2799 movaps %xmm9,0x30(%rsp)
2800 movaps %xmm10,0x40(%rsp)
2801 movaps %xmm11,0x50(%rsp)
2802 movaps %xmm12,0x60(%rsp)
2803 movaps %xmm13,0x70(%rsp)
2804 movaps %xmm14,0x80(%rsp)
2805 movaps %xmm15,0x90(%rsp)
2809 mov $seventh_arg(%rax),$L_p # 7th argument
2810 mov $seventh_arg+8(%rax),$checksum_p# 8th argument
2812 mov 240($key),$rnds_ # key->rounds
2815 $movkey ($key),$rndkey0l # round[0]
2816 $movkey 16($key,$rnds_),$rndkey1 # round[last]
2818 movdqu ($offset_p),@offset[5] # load last offset_i
2819 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
2820 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
2823 lea 32($key_,$rnds_),$key
2824 $movkey 16($key_),$rndkey1 # round[1]
2825 sub %r10,%rax # twisted $rounds
2826 mov %rax,%r10 # backup twisted $rounds
2828 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
2829 movdqu ($checksum_p),$checksum # load checksum
2831 test \$1,$block_num # is first block number odd?
2837 movdqu ($L_p,$i1),$inout5 # borrow
2838 movdqu ($inp),$inout0
2843 movdqa $inout5,@offset[5] # borrowed register becomes the carried last offset
2844 movups $inout0,($out) # store the single leading block
2850 lea 1($block_num),$i1 # even-numbered blocks
2851 lea 3($block_num),$i3
2852 lea 5($block_num),$i5
2853 lea 6($block_num),$block_num # advance the block counter by six
2854 bsf $i1,$i1 # ntz(block)
2857 shl \$4,$i1 # ntz(block) -> table offset
2863 jmp .Locb_enc_grandloop
# Main loop: six blocks are loaded, processed and stored per iteration.
2866 .Locb_enc_grandloop:
2867 movdqu `16*0`($inp),$inout0 # load input
2868 movdqu `16*1`($inp),$inout1
2869 movdqu `16*2`($inp),$inout2
2870 movdqu `16*3`($inp),$inout3
2871 movdqu `16*4`($inp),$inout4
2872 movdqu `16*5`($inp),$inout5
2873 lea `16*6`($inp),$inp
2877 movups $inout0,`16*0`($out) # store output
2878 movups $inout1,`16*1`($out)
2879 movups $inout2,`16*2`($out)
2880 movups $inout3,`16*3`($out)
2881 movups $inout4,`16*4`($out)
2882 movups $inout5,`16*5`($out)
2883 lea `16*6`($out),$out
2885 jnc .Locb_enc_grandloop
# Tail handling: the remaining blocks are loaded one after another.
2891 movdqu `16*0`($inp),$inout0
2894 movdqu `16*1`($inp),$inout1
2897 movdqu `16*2`($inp),$inout2
2900 movdqu `16*3`($inp),$inout3
2903 movdqu `16*4`($inp),$inout4
2904 pxor $inout5,$inout5 # five-block tail: zero the unused sixth register
2908 movdqa @offset[4],@offset[5] # offset of the fifth block becomes the carried last offset
2909 movups $inout0,`16*0`($out)
2910 movups $inout1,`16*1`($out)
2911 movups $inout2,`16*2`($out)
2912 movups $inout3,`16*3`($out)
2913 movups $inout4,`16*4`($out)
2919 movdqa @offset[0],$inout5 # borrow
2923 movdqa $inout5,@offset[5] # one-block tail: borrowed register becomes the carried last offset
2924 movups $inout0,`16*0`($out)
2929 pxor $inout2,$inout2 # two-block tail: zero the two unused registers
2930 pxor $inout3,$inout3
2934 movdqa @offset[1],@offset[5] # offset of the second block becomes the carried last offset
2935 movups $inout0,`16*0`($out)
2936 movups $inout1,`16*1`($out)
2942 pxor $inout3,$inout3 # three-block tail: zero the unused fourth register
2946 movdqa @offset[2],@offset[5] # offset of the third block becomes the carried last offset
2947 movups $inout0,`16*0`($out)
2948 movups $inout1,`16*1`($out)
2949 movups $inout2,`16*2`($out)
2957 movdqa @offset[3],@offset[5] # four-block tail: offset of the fourth block is carried
2958 movups $inout0,`16*0`($out)
2959 movups $inout1,`16*1`($out)
2960 movups $inout2,`16*2`($out)
2961 movups $inout3,`16*3`($out)
# Common exit: write the updated checksum and offset back to the caller.
2964 pxor $rndkey0,@offset[5] # "remove" round[last]
2965 movdqu $checksum,($checksum_p) # store checksum
2966 movdqu @offset[5],($offset_p) # store last offset_i
2968 xorps %xmm0,%xmm0 # clear register bank
2975 $code.=<<___ if (!$win64);
# Win64 only: reload %xmm6-%xmm15 and overwrite each save slot with the
# zeroed %xmm0 so no register contents are left on the stack.
2989 $code.=<<___ if ($win64);
2990 movaps 0x00(%rsp),%xmm6
2991 movaps %xmm0,0x00(%rsp) # clear stack
2992 movaps 0x10(%rsp),%xmm7
2993 movaps %xmm0,0x10(%rsp)
2994 movaps 0x20(%rsp),%xmm8
2995 movaps %xmm0,0x20(%rsp)
2996 movaps 0x30(%rsp),%xmm9
2997 movaps %xmm0,0x30(%rsp)
2998 movaps 0x40(%rsp),%xmm10
2999 movaps %xmm0,0x40(%rsp)
3000 movaps 0x50(%rsp),%xmm11
3001 movaps %xmm0,0x50(%rsp)
3002 movaps 0x60(%rsp),%xmm12
3003 movaps %xmm0,0x60(%rsp)
3004 movaps 0x70(%rsp),%xmm13
3005 movaps %xmm0,0x70(%rsp)
3006 movaps 0x80(%rsp),%xmm14
3007 movaps %xmm0,0x80(%rsp)
3008 movaps 0x90(%rsp),%xmm15
3009 movaps %xmm0,0x90(%rsp)
3010 lea 0xa0+0x28(%rsp),%rax # step over the 0xa0-byte save area plus 0x28 bytes; NOTE(review): presumably the pushed registers, confirm against the prologue
3025 .cfi_def_cfa_register %rsp
3029 .size aesni_ocb_encrypt,.-aesni_ocb_encrypt
# __ocb_encrypt6: helper that encrypts the six blocks held in $inout0..$inout5.
# Expected on entry, as set up by the entry point: @offset[5] holds
# offset_i ^ round[last], @offset[0] holds L_0, $rndkey0l holds
# round[0] ^ round[last] and $rndkey1 holds round[1].
# Every input block is XORed into $checksum before it is whitened, the six
# offsets are chained by XOR, and the closing instruction of each block uses
# offset ^ round[last] as its round key, which applies the output offset too.
3031 .type __ocb_encrypt6,\@abi-omnipotent
3034 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3035 movdqu ($L_p,$i1),@offset[1] # table entry selected by $i1
3036 movdqa @offset[0],@offset[2] # copy of L_0
3037 movdqu ($L_p,$i3),@offset[3] # table entry selected by $i3
3038 movdqa @offset[0],@offset[4] # copy of L_0
3039 pxor @offset[5],@offset[0] # first offset: L_0 ^ previous offset
3040 movdqu ($L_p,$i5),@offset[5] # table entry selected by $i5; the old value was consumed above
3041 pxor @offset[0],@offset[1] # every further offset chains on the one before it
3042 pxor $inout0,$checksum # accumulate checksum
3043 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3044 pxor @offset[1],@offset[2]
3045 pxor $inout1,$checksum
3046 pxor @offset[1],$inout1
3047 pxor @offset[2],@offset[3]
3048 pxor $inout2,$checksum
3049 pxor @offset[2],$inout2
3050 pxor @offset[3],@offset[4]
3051 pxor $inout3,$checksum
3052 pxor @offset[3],$inout3
3053 pxor @offset[4],@offset[5]
3054 pxor $inout4,$checksum
3055 pxor @offset[4],$inout4
3056 pxor $inout5,$checksum
3057 pxor @offset[5],$inout5
3058 $movkey 32($key_),$rndkey0 # round[2]
3060 lea 1($block_num),$i1 # even-numbered blocks
3061 lea 3($block_num),$i3 # index registers were consumed above, so they are recomputed here
3062 lea 5($block_num),$i5
3064 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3065 bsf $i1,$i1 # ntz(block)
3069 aesenc $rndkey1,$inout0
3070 aesenc $rndkey1,$inout1
3071 aesenc $rndkey1,$inout2
3072 aesenc $rndkey1,$inout3
3073 pxor $rndkey0l,@offset[1] # same round[0] -> round[last] swap for the other offsets
3074 pxor $rndkey0l,@offset[2]
3075 aesenc $rndkey1,$inout4
3076 pxor $rndkey0l,@offset[3]
3077 pxor $rndkey0l,@offset[4]
3078 aesenc $rndkey1,$inout5
3079 $movkey 48($key_),$rndkey1 # round[3]
3080 pxor $rndkey0l,@offset[5]
3082 aesenc $rndkey0,$inout0
3083 aesenc $rndkey0,$inout1
3084 aesenc $rndkey0,$inout2
3085 aesenc $rndkey0,$inout3
3086 aesenc $rndkey0,$inout4
3087 aesenc $rndkey0,$inout5
3088 $movkey 64($key_),$rndkey0 # round[4]
3089 shl \$4,$i1 # ntz(block) -> table offset
3095 aesenc $rndkey1,$inout0
3096 aesenc $rndkey1,$inout1
3097 aesenc $rndkey1,$inout2
3098 aesenc $rndkey1,$inout3
3099 aesenc $rndkey1,$inout4
3100 aesenc $rndkey1,$inout5
3101 $movkey ($key,%rax),$rndkey1 # next key, indexed by the twisted rounds counter in %rax
3104 aesenc $rndkey0,$inout0
3105 aesenc $rndkey0,$inout1
3106 aesenc $rndkey0,$inout2
3107 aesenc $rndkey0,$inout3
3108 aesenc $rndkey0,$inout4
3109 aesenc $rndkey0,$inout5
3110 $movkey -16($key,%rax),$rndkey0
3113 aesenc $rndkey1,$inout0
3114 aesenc $rndkey1,$inout1
3115 aesenc $rndkey1,$inout2
3116 aesenc $rndkey1,$inout3
3117 aesenc $rndkey1,$inout4
3118 aesenc $rndkey1,$inout5
3119 $movkey 16($key_),$rndkey1 # round[1] again, ready for the next call
3122 aesenclast @offset[0],$inout0 # keyed with offset_i ^ round[last]
3123 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3124 mov %r10,%rax # restore twisted rounds
3125 aesenclast @offset[1],$inout1
3126 aesenclast @offset[2],$inout2
3127 aesenclast @offset[3],$inout3
3128 aesenclast @offset[4],$inout4
3129 aesenclast @offset[5],$inout5
3131 .size __ocb_encrypt6,.-__ocb_encrypt6
# __ocb_encrypt4: four-block variant of the six-block helper, operating on
# $inout0..$inout3 and producing offsets in @offset[0..3].
# Entry expectations match the six-block helper: @offset[5] holds
# offset_i ^ round[last], @offset[0] holds L_0 and $rndkey1 holds round[1].
# Unlike the six-block helper it neither recomputes the index registers nor
# reloads L_0 into @offset[0].
3133 .type __ocb_encrypt4,\@abi-omnipotent
3136 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3137 movdqu ($L_p,$i1),@offset[1] # table entry selected by $i1
3138 movdqa @offset[0],@offset[2] # copy of L_0
3139 movdqu ($L_p,$i3),@offset[3] # table entry selected by $i3
3140 pxor @offset[5],@offset[0] # first offset: L_0 ^ previous offset
3141 pxor @offset[0],@offset[1] # every further offset chains on the one before it
3142 pxor $inout0,$checksum # accumulate checksum
3143 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3144 pxor @offset[1],@offset[2]
3145 pxor $inout1,$checksum
3146 pxor @offset[1],$inout1
3147 pxor @offset[2],@offset[3]
3148 pxor $inout2,$checksum
3149 pxor @offset[2],$inout2
3150 pxor $inout3,$checksum
3151 pxor @offset[3],$inout3
3152 $movkey 32($key_),$rndkey0 # round[2]
3154 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3155 pxor $rndkey0l,@offset[1]
3156 pxor $rndkey0l,@offset[2]
3157 pxor $rndkey0l,@offset[3]
3159 aesenc $rndkey1,$inout0
3160 aesenc $rndkey1,$inout1
3161 aesenc $rndkey1,$inout2
3162 aesenc $rndkey1,$inout3
3163 $movkey 48($key_),$rndkey1 # round[3]
3165 aesenc $rndkey0,$inout0
3166 aesenc $rndkey0,$inout1
3167 aesenc $rndkey0,$inout2
3168 aesenc $rndkey0,$inout3
3169 $movkey 64($key_),$rndkey0 # round[4]
3174 aesenc $rndkey1,$inout0
3175 aesenc $rndkey1,$inout1
3176 aesenc $rndkey1,$inout2
3177 aesenc $rndkey1,$inout3
3178 $movkey ($key,%rax),$rndkey1 # next key, indexed by the twisted rounds counter in %rax
3181 aesenc $rndkey0,$inout0
3182 aesenc $rndkey0,$inout1
3183 aesenc $rndkey0,$inout2
3184 aesenc $rndkey0,$inout3
3185 $movkey -16($key,%rax),$rndkey0
3188 aesenc $rndkey1,$inout0
3189 aesenc $rndkey1,$inout1
3190 aesenc $rndkey1,$inout2
3191 aesenc $rndkey1,$inout3
3192 $movkey 16($key_),$rndkey1 # round[1] again
3193 mov %r10,%rax # restore twisted rounds
3195 aesenclast @offset[0],$inout0 # keyed with offset_i ^ round[last]
3196 aesenclast @offset[1],$inout1
3197 aesenclast @offset[2],$inout2
3198 aesenclast @offset[3],$inout3
3200 .size __ocb_encrypt4,.-__ocb_encrypt4
# __ocb_encrypt1: single-block helper operating on $inout0.
# The caller preloads $inout5 with the value to fold into the offset (see the
# lines marked "borrow" in the entry point); @offset[5] holds the previous
# offset_i ^ round[last]. On return $inout5 holds the new
# offset_i ^ round[last], which the caller copies back into @offset[5].
3202 .type __ocb_encrypt1,\@abi-omnipotent
3205 pxor @offset[5],$inout5 # offset_i
3206 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3207 pxor $inout0,$checksum # accumulate checksum
3208 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3209 $movkey 32($key_),$rndkey0 # round[2]
3211 aesenc $rndkey1,$inout0
3212 $movkey 48($key_),$rndkey1 # round[3]
3213 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3215 aesenc $rndkey0,$inout0
3216 $movkey 64($key_),$rndkey0 # round[4]
3221 aesenc $rndkey1,$inout0
3222 $movkey ($key,%rax),$rndkey1 # next key, indexed by the twisted rounds counter in %rax
3225 aesenc $rndkey0,$inout0
3226 $movkey -16($key,%rax),$rndkey0
3229 aesenc $rndkey1,$inout0
3230 $movkey 16($key_),$rndkey1 # redundant in tail
3231 mov %r10,%rax # restore twisted rounds
3233 aesenclast $inout5,$inout0 # keyed with offset_i ^ round[last]
3235 .size __ocb_encrypt1,.-__ocb_encrypt1
# Exported OCB decryption entry point; its C prototype is given in the
# banner comment that precedes the OCB register assignments.
# Same register conventions as the encryption entry point. The difference
# visible here is that the checksum is accumulated in this routine from the
# blocks it stores, rather than inside the helpers.
3237 .globl aesni_ocb_decrypt
3238 .type aesni_ocb_decrypt,\@function,6
# Win64 only: reserve 0xa0 bytes and spill %xmm6-%xmm15 into them.
3254 $code.=<<___ if ($win64);
3255 lea -0xa0(%rsp),%rsp
3256 movaps %xmm6,0x00(%rsp) # offload everything
3257 movaps %xmm7,0x10(%rsp)
3258 movaps %xmm8,0x20(%rsp)
3259 movaps %xmm9,0x30(%rsp)
3260 movaps %xmm10,0x40(%rsp)
3261 movaps %xmm11,0x50(%rsp)
3262 movaps %xmm12,0x60(%rsp)
3263 movaps %xmm13,0x70(%rsp)
3264 movaps %xmm14,0x80(%rsp)
3265 movaps %xmm15,0x90(%rsp)
3269 mov $seventh_arg(%rax),$L_p # 7th argument
3270 mov $seventh_arg+8(%rax),$checksum_p# 8th argument
3272 mov 240($key),$rnds_ # key->rounds
3275 $movkey ($key),$rndkey0l # round[0]
3276 $movkey 16($key,$rnds_),$rndkey1 # round[last]
3278 movdqu ($offset_p),@offset[5] # load last offset_i
3279 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
3280 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
3283 lea 32($key_,$rnds_),$key
3284 $movkey 16($key_),$rndkey1 # round[1]
3285 sub %r10,%rax # twisted $rounds
3286 mov %rax,%r10 # backup twisted $rounds
3288 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3289 movdqu ($checksum_p),$checksum # load checksum
3291 test \$1,$block_num # is first block number odd?
3297 movdqu ($L_p,$i1),$inout5 # borrow
3298 movdqu ($inp),$inout0
3303 movdqa $inout5,@offset[5] # borrowed register becomes the carried last offset
3304 movups $inout0,($out) # store the single leading block
3305 xorps $inout0,$checksum # accumulate checksum
3311 lea 1($block_num),$i1 # even-numbered blocks
3312 lea 3($block_num),$i3
3313 lea 5($block_num),$i5
3314 lea 6($block_num),$block_num # advance the block counter by six
3315 bsf $i1,$i1 # ntz(block)
3318 shl \$4,$i1 # ntz(block) -> table offset
3324 jmp .Locb_dec_grandloop
# Main loop: six blocks are loaded, processed and stored per iteration,
# and each stored block is folded into the checksum.
3327 .Locb_dec_grandloop:
3328 movdqu `16*0`($inp),$inout0 # load input
3329 movdqu `16*1`($inp),$inout1
3330 movdqu `16*2`($inp),$inout2
3331 movdqu `16*3`($inp),$inout3
3332 movdqu `16*4`($inp),$inout4
3333 movdqu `16*5`($inp),$inout5
3334 lea `16*6`($inp),$inp
3338 movups $inout0,`16*0`($out) # store output
3339 pxor $inout0,$checksum # accumulate checksum
3340 movups $inout1,`16*1`($out)
3341 pxor $inout1,$checksum
3342 movups $inout2,`16*2`($out)
3343 pxor $inout2,$checksum
3344 movups $inout3,`16*3`($out)
3345 pxor $inout3,$checksum
3346 movups $inout4,`16*4`($out)
3347 pxor $inout4,$checksum
3348 movups $inout5,`16*5`($out)
3349 pxor $inout5,$checksum
3350 lea `16*6`($out),$out
3352 jnc .Locb_dec_grandloop
# Tail handling: the remaining blocks are loaded one after another.
3358 movdqu `16*0`($inp),$inout0
3361 movdqu `16*1`($inp),$inout1
3364 movdqu `16*2`($inp),$inout2
3367 movdqu `16*3`($inp),$inout3
3370 movdqu `16*4`($inp),$inout4
3371 pxor $inout5,$inout5 # five-block tail: zero the unused sixth register
3375 movdqa @offset[4],@offset[5] # offset of the fifth block becomes the carried last offset
3376 movups $inout0,`16*0`($out) # store output
3377 pxor $inout0,$checksum # accumulate checksum
3378 movups $inout1,`16*1`($out)
3379 pxor $inout1,$checksum
3380 movups $inout2,`16*2`($out)
3381 pxor $inout2,$checksum
3382 movups $inout3,`16*3`($out)
3383 pxor $inout3,$checksum
3384 movups $inout4,`16*4`($out)
3385 pxor $inout4,$checksum
3391 movdqa @offset[0],$inout5 # borrow
3395 movdqa $inout5,@offset[5] # one-block tail: borrowed register becomes the carried last offset
3396 movups $inout0,`16*0`($out) # store output
3397 xorps $inout0,$checksum # accumulate checksum
3402 pxor $inout2,$inout2 # two-block tail: zero the two unused registers
3403 pxor $inout3,$inout3
3407 movdqa @offset[1],@offset[5] # offset of the second block becomes the carried last offset
3408 movups $inout0,`16*0`($out) # store output
3409 xorps $inout0,$checksum # accumulate checksum
3410 movups $inout1,`16*1`($out)
3411 xorps $inout1,$checksum
3417 pxor $inout3,$inout3 # three-block tail: zero the unused fourth register
3421 movdqa @offset[2],@offset[5] # offset of the third block becomes the carried last offset
3422 movups $inout0,`16*0`($out) # store output
3423 xorps $inout0,$checksum # accumulate checksum
3424 movups $inout1,`16*1`($out)
3425 xorps $inout1,$checksum
3426 movups $inout2,`16*2`($out)
3427 xorps $inout2,$checksum
3435 movdqa @offset[3],@offset[5] # four-block tail: offset of the fourth block is carried
3436 movups $inout0,`16*0`($out) # store output
3437 pxor $inout0,$checksum # accumulate checksum
3438 movups $inout1,`16*1`($out)
3439 pxor $inout1,$checksum
3440 movups $inout2,`16*2`($out)
3441 pxor $inout2,$checksum
3442 movups $inout3,`16*3`($out)
3443 pxor $inout3,$checksum
# Common exit: write the updated checksum and offset back to the caller.
3446 pxor $rndkey0,@offset[5] # "remove" round[last]
3447 movdqu $checksum,($checksum_p) # store checksum
3448 movdqu @offset[5],($offset_p) # store last offset_i
3450 xorps %xmm0,%xmm0 # clear register bank
3457 $code.=<<___ if (!$win64);
# Win64 only: reload %xmm6-%xmm15 and overwrite each save slot with the
# zeroed %xmm0 so no register contents are left on the stack.
3471 $code.=<<___ if ($win64);
3472 movaps 0x00(%rsp),%xmm6
3473 movaps %xmm0,0x00(%rsp) # clear stack
3474 movaps 0x10(%rsp),%xmm7
3475 movaps %xmm0,0x10(%rsp)
3476 movaps 0x20(%rsp),%xmm8
3477 movaps %xmm0,0x20(%rsp)
3478 movaps 0x30(%rsp),%xmm9
3479 movaps %xmm0,0x30(%rsp)
3480 movaps 0x40(%rsp),%xmm10
3481 movaps %xmm0,0x40(%rsp)
3482 movaps 0x50(%rsp),%xmm11
3483 movaps %xmm0,0x50(%rsp)
3484 movaps 0x60(%rsp),%xmm12
3485 movaps %xmm0,0x60(%rsp)
3486 movaps 0x70(%rsp),%xmm13
3487 movaps %xmm0,0x70(%rsp)
3488 movaps 0x80(%rsp),%xmm14
3489 movaps %xmm0,0x80(%rsp)
3490 movaps 0x90(%rsp),%xmm15
3491 movaps %xmm0,0x90(%rsp)
3492 lea 0xa0+0x28(%rsp),%rax # step over the 0xa0-byte save area plus 0x28 bytes; NOTE(review): presumably the pushed registers, confirm against the prologue
3507 .cfi_def_cfa_register %rsp
3511 .size aesni_ocb_decrypt,.-aesni_ocb_decrypt
# __ocb_decrypt6: helper that decrypts the six blocks held in $inout0..$inout5.
# Expected on entry, as set up by the entry point: @offset[5] holds
# offset_i ^ round[last], @offset[0] holds L_0, $rndkey0l holds
# round[0] ^ round[last] and $rndkey1 holds round[1].
# The offset chain is built exactly as in the encrypt helper, but $checksum
# is not touched here; the entry point folds in the blocks it stores.
3513 .type __ocb_decrypt6,\@abi-omnipotent
3516 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3517 movdqu ($L_p,$i1),@offset[1] # table entry selected by $i1
3518 movdqa @offset[0],@offset[2] # copy of L_0
3519 movdqu ($L_p,$i3),@offset[3] # table entry selected by $i3
3520 movdqa @offset[0],@offset[4] # copy of L_0
3521 pxor @offset[5],@offset[0] # first offset: L_0 ^ previous offset
3522 movdqu ($L_p,$i5),@offset[5] # table entry selected by $i5; the old value was consumed above
3523 pxor @offset[0],@offset[1] # every further offset chains on the one before it
3524 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3525 pxor @offset[1],@offset[2]
3526 pxor @offset[1],$inout1
3527 pxor @offset[2],@offset[3]
3528 pxor @offset[2],$inout2
3529 pxor @offset[3],@offset[4]
3530 pxor @offset[3],$inout3
3531 pxor @offset[4],@offset[5]
3532 pxor @offset[4],$inout4
3533 pxor @offset[5],$inout5
3534 $movkey 32($key_),$rndkey0 # round[2]
3536 lea 1($block_num),$i1 # even-numbered blocks
3537 lea 3($block_num),$i3 # index registers were consumed above, so they are recomputed here
3538 lea 5($block_num),$i5
3540 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3541 bsf $i1,$i1 # ntz(block)
3545 aesdec $rndkey1,$inout0
3546 aesdec $rndkey1,$inout1
3547 aesdec $rndkey1,$inout2
3548 aesdec $rndkey1,$inout3
3549 pxor $rndkey0l,@offset[1] # same round[0] -> round[last] swap for the other offsets
3550 pxor $rndkey0l,@offset[2]
3551 aesdec $rndkey1,$inout4
3552 pxor $rndkey0l,@offset[3]
3553 pxor $rndkey0l,@offset[4]
3554 aesdec $rndkey1,$inout5
3555 $movkey 48($key_),$rndkey1 # round[3]
3556 pxor $rndkey0l,@offset[5]
3558 aesdec $rndkey0,$inout0
3559 aesdec $rndkey0,$inout1
3560 aesdec $rndkey0,$inout2
3561 aesdec $rndkey0,$inout3
3562 aesdec $rndkey0,$inout4
3563 aesdec $rndkey0,$inout5
3564 $movkey 64($key_),$rndkey0 # round[4]
3565 shl \$4,$i1 # ntz(block) -> table offset
3571 aesdec $rndkey1,$inout0
3572 aesdec $rndkey1,$inout1
3573 aesdec $rndkey1,$inout2
3574 aesdec $rndkey1,$inout3
3575 aesdec $rndkey1,$inout4
3576 aesdec $rndkey1,$inout5
3577 $movkey ($key,%rax),$rndkey1 # next key, indexed by the twisted rounds counter in %rax
3580 aesdec $rndkey0,$inout0
3581 aesdec $rndkey0,$inout1
3582 aesdec $rndkey0,$inout2
3583 aesdec $rndkey0,$inout3
3584 aesdec $rndkey0,$inout4
3585 aesdec $rndkey0,$inout5
3586 $movkey -16($key,%rax),$rndkey0
3589 aesdec $rndkey1,$inout0
3590 aesdec $rndkey1,$inout1
3591 aesdec $rndkey1,$inout2
3592 aesdec $rndkey1,$inout3
3593 aesdec $rndkey1,$inout4
3594 aesdec $rndkey1,$inout5
3595 $movkey 16($key_),$rndkey1 # round[1] again, ready for the next call
3598 aesdeclast @offset[0],$inout0 # keyed with offset_i ^ round[last]
3599 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3600 mov %r10,%rax # restore twisted rounds
3601 aesdeclast @offset[1],$inout1
3602 aesdeclast @offset[2],$inout2
3603 aesdeclast @offset[3],$inout3
3604 aesdeclast @offset[4],$inout4
3605 aesdeclast @offset[5],$inout5
3607 .size __ocb_decrypt6,.-__ocb_decrypt6
# __ocb_decrypt4: four-block variant of the six-block decrypt helper,
# operating on $inout0..$inout3 and producing offsets in @offset[0..3].
# Entry expectations match the six-block helper. It does not touch
# $checksum, does not recompute the index registers and does not reload
# L_0 into @offset[0].
3609 .type __ocb_decrypt4,\@abi-omnipotent
3612 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3613 movdqu ($L_p,$i1),@offset[1] # table entry selected by $i1
3614 movdqa @offset[0],@offset[2] # copy of L_0
3615 movdqu ($L_p,$i3),@offset[3] # table entry selected by $i3
3616 pxor @offset[5],@offset[0] # first offset: L_0 ^ previous offset
3617 pxor @offset[0],@offset[1] # every further offset chains on the one before it
3618 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3619 pxor @offset[1],@offset[2]
3620 pxor @offset[1],$inout1
3621 pxor @offset[2],@offset[3]
3622 pxor @offset[2],$inout2
3623 pxor @offset[3],$inout3
3624 $movkey 32($key_),$rndkey0 # round[2]
3626 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3627 pxor $rndkey0l,@offset[1]
3628 pxor $rndkey0l,@offset[2]
3629 pxor $rndkey0l,@offset[3]
3631 aesdec $rndkey1,$inout0
3632 aesdec $rndkey1,$inout1
3633 aesdec $rndkey1,$inout2
3634 aesdec $rndkey1,$inout3
3635 $movkey 48($key_),$rndkey1 # round[3]
3637 aesdec $rndkey0,$inout0
3638 aesdec $rndkey0,$inout1
3639 aesdec $rndkey0,$inout2
3640 aesdec $rndkey0,$inout3
3641 $movkey 64($key_),$rndkey0 # round[4]
3646 aesdec $rndkey1,$inout0
3647 aesdec $rndkey1,$inout1
3648 aesdec $rndkey1,$inout2
3649 aesdec $rndkey1,$inout3
3650 $movkey ($key,%rax),$rndkey1 # next key, indexed by the twisted rounds counter in %rax
3653 aesdec $rndkey0,$inout0
3654 aesdec $rndkey0,$inout1
3655 aesdec $rndkey0,$inout2
3656 aesdec $rndkey0,$inout3
3657 $movkey -16($key,%rax),$rndkey0
3660 aesdec $rndkey1,$inout0
3661 aesdec $rndkey1,$inout1
3662 aesdec $rndkey1,$inout2
3663 aesdec $rndkey1,$inout3
3664 $movkey 16($key_),$rndkey1 # round[1] again
3665 mov %r10,%rax # restore twisted rounds
3667 aesdeclast @offset[0],$inout0 # keyed with offset_i ^ round[last]
3668 aesdeclast @offset[1],$inout1
3669 aesdeclast @offset[2],$inout2
3670 aesdeclast @offset[3],$inout3
3672 .size __ocb_decrypt4,.-__ocb_decrypt4
# __ocb_decrypt1: single-block decrypt helper operating on $inout0.
# The caller preloads $inout5 with the value to fold into the offset (see the
# lines marked "borrow" in the entry point); @offset[5] holds the previous
# offset_i ^ round[last]. On return $inout5 holds the new
# offset_i ^ round[last]. $checksum is not touched here; the entry point
# folds in the block it stores.
3674 .type __ocb_decrypt1,\@abi-omnipotent
3677 pxor @offset[5],$inout5 # offset_i
3678 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3679 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3680 $movkey 32($key_),$rndkey0 # round[2]
3682 aesdec $rndkey1,$inout0
3683 $movkey 48($key_),$rndkey1 # round[3]
3684 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3686 aesdec $rndkey0,$inout0
3687 $movkey 64($key_),$rndkey0 # round[4]
3692 aesdec $rndkey1,$inout0
3693 $movkey ($key,%rax),$rndkey1 # next key, indexed by the twisted rounds counter in %rax
3696 aesdec $rndkey0,$inout0
3697 $movkey -16($key,%rax),$rndkey0
3700 aesdec $rndkey1,$inout0
3701 $movkey 16($key_),$rndkey1 # redundant in tail
3702 mov %r10,%rax # restore twisted rounds
3704 aesdeclast $inout5,$inout0 # keyed with offset_i ^ round[last]
3706 .size __ocb_decrypt1,.-__ocb_decrypt1
3710 ########################################################################
3711 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
3712 # size_t length, const AES_KEY *key,
3713 # unsigned char *ivp,const int enc);
3715 my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
3716 my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
3719 .globl ${PREFIX}_cbc_encrypt
3720 .type ${PREFIX}_cbc_encrypt,\@function,6
3722 ${PREFIX}_cbc_encrypt:
3724 test $len,$len # check length
3727 mov 240($key),$rnds_ # key->rounds
3728 mov $key,$key_ # backup $key
3729 test %r9d,%r9d # 6th argument
3731 #--------------------------- CBC ENCRYPT ------------------------------#
3732 movups ($ivp),$inout0 # load iv as initial state
3740 movups ($inp),$inout1 # load input
3742 #xorps $inout1,$inout0
3744 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
3746 mov $rnds_,$rounds # restore $rounds
3747 mov $key_,$key # restore $key
3748 movups $inout0,0($out) # store output
3754 pxor $rndkey0,$rndkey0 # clear register bank
3755 pxor $rndkey1,$rndkey1
3756 movups $inout0,($ivp)
3757 pxor $inout0,$inout0
3758 pxor $inout1,$inout1
3762 mov $len,%rcx # zaps $key
3763 xchg $inp,$out # $inp is %rsi and $out is %rdi now
3764 .long 0x9066A4F3 # rep movsb
3765 mov \$16,%ecx # zero tail
3768 .long 0x9066AAF3 # rep stosb
3769 lea -16(%rdi),%rdi # rewind $out by 1 block
3770 mov $rnds_,$rounds # restore $rounds
3771 mov %rdi,%rsi # $inp and $out are the same
3772 mov $key_,$key # restore $key
3773 xor $len,$len # len=16
3774 jmp .Lcbc_enc_loop # one more spin
3775 \f#--------------------------- CBC DECRYPT ------------------------------#
3779 jne .Lcbc_decrypt_bulk
3781 # handle single block without allocating stack frame,
3782 # useful in ciphertext stealing mode
3783 movdqu ($inp),$inout0 # load input
3784 movdqu ($ivp),$inout1 # load iv
3785 movdqa $inout0,$inout2 # future iv
3787 &aesni_generate1("dec",$key,$rnds_);
3789 pxor $rndkey0,$rndkey0 # clear register bank
3790 pxor $rndkey1,$rndkey1
3791 movdqu $inout2,($ivp) # store iv
3792 xorps $inout1,$inout0 # ^=iv
3793 pxor $inout1,$inout1
3794 movups $inout0,($out) # store output
3795 pxor $inout0,$inout0
3799 lea (%rsp),%r11 # frame pointer
3800 .cfi_def_cfa_register %r11
3803 sub \$$frame_size,%rsp
3804 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
3806 $code.=<<___ if ($win64);
3807 movaps %xmm6,0x10(%rsp)
3808 movaps %xmm7,0x20(%rsp)
3809 movaps %xmm8,0x30(%rsp)
3810 movaps %xmm9,0x40(%rsp)
3811 movaps %xmm10,0x50(%rsp)
3812 movaps %xmm11,0x60(%rsp)
3813 movaps %xmm12,0x70(%rsp)
3814 movaps %xmm13,0x80(%rsp)
3815 movaps %xmm14,0x90(%rsp)
3816 movaps %xmm15,0xa0(%rsp)
3820 my $inp_=$key_="%rbp"; # reassign $key_
3823 mov $key,$key_ # [re-]backup $key [after reassignment]
3829 $movkey ($key),$rndkey0
3830 movdqu 0x00($inp),$inout0 # load input
3831 movdqu 0x10($inp),$inout1
3833 movdqu 0x20($inp),$inout2
3835 movdqu 0x30($inp),$inout3
3837 movdqu 0x40($inp),$inout4
3839 movdqu 0x50($inp),$inout5
3841 mov OPENSSL_ia32cap_P+4(%rip),%r9d
3843 jbe .Lcbc_dec_six_or_seven
3845 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
3846 sub \$0x50,$len # $len is biased by -5*16
3847 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
3848 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont]
3849 sub \$0x20,$len # $len is biased by -7*16
3850 lea 0x70($key),$key # size optimization
3851 jmp .Lcbc_dec_loop8_enter
3854 movups $inout7,($out)
3856 .Lcbc_dec_loop8_enter:
3857 movdqu 0x60($inp),$inout6
3858 pxor $rndkey0,$inout0
3859 movdqu 0x70($inp),$inout7
3860 pxor $rndkey0,$inout1
3861 $movkey 0x10-0x70($key),$rndkey1
3862 pxor $rndkey0,$inout2
3864 cmp \$0x70,$len # is there at least 0x60 bytes ahead?
3865 pxor $rndkey0,$inout3
3866 pxor $rndkey0,$inout4
3867 pxor $rndkey0,$inout5
3868 pxor $rndkey0,$inout6
3870 aesdec $rndkey1,$inout0
3871 pxor $rndkey0,$inout7
3872 $movkey 0x20-0x70($key),$rndkey0
3873 aesdec $rndkey1,$inout1
3874 aesdec $rndkey1,$inout2
3875 aesdec $rndkey1,$inout3
3876 aesdec $rndkey1,$inout4
3877 aesdec $rndkey1,$inout5
3878 aesdec $rndkey1,$inout6
3881 aesdec $rndkey1,$inout7
3883 $movkey 0x30-0x70($key),$rndkey1
3885 for($i=1;$i<12;$i++) {
3886 my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
3887 $code.=<<___ if ($i==7);
3891 aesdec $rndkeyx,$inout0
3892 aesdec $rndkeyx,$inout1
3893 aesdec $rndkeyx,$inout2
3894 aesdec $rndkeyx,$inout3
3895 aesdec $rndkeyx,$inout4
3896 aesdec $rndkeyx,$inout5
3897 aesdec $rndkeyx,$inout6
3898 aesdec $rndkeyx,$inout7
3899 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
3901 $code.=<<___ if ($i<6 || (!($i&1) && $i>7));
3904 $code.=<<___ if ($i==7);
3907 $code.=<<___ if ($i==9);
3910 $code.=<<___ if ($i==11);
3917 aesdec $rndkey1,$inout0
3918 aesdec $rndkey1,$inout1
3921 aesdec $rndkey1,$inout2
3922 aesdec $rndkey1,$inout3
3925 aesdec $rndkey1,$inout4
3926 aesdec $rndkey1,$inout5
3929 aesdec $rndkey1,$inout6
3930 aesdec $rndkey1,$inout7
3931 movdqu 0x50($inp),$rndkey1
3933 aesdeclast $iv,$inout0
3934 movdqu 0x60($inp),$iv # borrow $iv
3935 pxor $rndkey0,$rndkey1
3936 aesdeclast $in0,$inout1
3938 movdqu 0x70($inp),$rndkey0 # next IV
3939 aesdeclast $in1,$inout2
3941 movdqu 0x00($inp_),$in0
3942 aesdeclast $in2,$inout3
3943 aesdeclast $in3,$inout4
3944 movdqu 0x10($inp_),$in1
3945 movdqu 0x20($inp_),$in2
3946 aesdeclast $in4,$inout5
3947 aesdeclast $rndkey1,$inout6
3948 movdqu 0x30($inp_),$in3
3949 movdqu 0x40($inp_),$in4
3950 aesdeclast $iv,$inout7
3951 movdqa $rndkey0,$iv # return $iv
3952 movdqu 0x50($inp_),$rndkey1
3953 $movkey -0x70($key),$rndkey0
3955 movups $inout0,($out) # store output
3957 movups $inout1,0x10($out)
3959 movups $inout2,0x20($out)
3961 movups $inout3,0x30($out)
3963 movups $inout4,0x40($out)
3965 movups $inout5,0x50($out)
3966 movdqa $rndkey1,$inout5
3967 movups $inout6,0x60($out)
3973 movaps $inout7,$inout0
3974 lea -0x70($key),$key
3976 jle .Lcbc_dec_clear_tail_collected
3977 movups $inout7,($out)
3983 .Lcbc_dec_six_or_seven:
3987 movaps $inout5,$inout6
3988 call _aesni_decrypt6
3989 pxor $iv,$inout0 # ^= IV
3992 movdqu $inout0,($out)
3994 movdqu $inout1,0x10($out)
3995 pxor $inout1,$inout1 # clear register bank
3997 movdqu $inout2,0x20($out)
3998 pxor $inout2,$inout2
4000 movdqu $inout3,0x30($out)
4001 pxor $inout3,$inout3
4003 movdqu $inout4,0x40($out)
4004 pxor $inout4,$inout4
4006 movdqa $inout5,$inout0
4007 pxor $inout5,$inout5
4008 jmp .Lcbc_dec_tail_collected
4012 movups 0x60($inp),$inout6
4013 xorps $inout7,$inout7
4014 call _aesni_decrypt8
4015 movups 0x50($inp),$inout7
4016 pxor $iv,$inout0 # ^= IV
4017 movups 0x60($inp),$iv
4019 movdqu $inout0,($out)
4021 movdqu $inout1,0x10($out)
4022 pxor $inout1,$inout1 # clear register bank
4024 movdqu $inout2,0x20($out)
4025 pxor $inout2,$inout2
4027 movdqu $inout3,0x30($out)
4028 pxor $inout3,$inout3
4030 movdqu $inout4,0x40($out)
4031 pxor $inout4,$inout4
4032 pxor $inout7,$inout6
4033 movdqu $inout5,0x50($out)
4034 pxor $inout5,$inout5
4036 movdqa $inout6,$inout0
4037 pxor $inout6,$inout6
4038 pxor $inout7,$inout7
4039 jmp .Lcbc_dec_tail_collected
4043 movups $inout5,($out)
4045 movdqu 0x00($inp),$inout0 # load input
4046 movdqu 0x10($inp),$inout1
4048 movdqu 0x20($inp),$inout2
4050 movdqu 0x30($inp),$inout3
4052 movdqu 0x40($inp),$inout4
4054 movdqu 0x50($inp),$inout5
4056 .Lcbc_dec_loop6_enter:
4058 movdqa $inout5,$inout6
4060 call _aesni_decrypt6
4062 pxor $iv,$inout0 # ^= IV
4065 movdqu $inout0,($out)
4067 movdqu $inout1,0x10($out)
4069 movdqu $inout2,0x20($out)
4072 movdqu $inout3,0x30($out)
4075 movdqu $inout4,0x40($out)
4080 movdqa $inout5,$inout0
4082 jle .Lcbc_dec_clear_tail_collected
4083 movups $inout5,($out)
4087 movups ($inp),$inout0
4089 jbe .Lcbc_dec_one # $len is 1*16 or less
4091 movups 0x10($inp),$inout1
4094 jbe .Lcbc_dec_two # $len is 2*16 or less
4096 movups 0x20($inp),$inout2
4099 jbe .Lcbc_dec_three # $len is 3*16 or less
4101 movups 0x30($inp),$inout3
4104 jbe .Lcbc_dec_four # $len is 4*16 or less
4106 movups 0x40($inp),$inout4 # $len is 5*16 or less
4109 xorps $inout5,$inout5
4110 call _aesni_decrypt6
4114 movdqu $inout0,($out)
4116 movdqu $inout1,0x10($out)
4117 pxor $inout1,$inout1 # clear register bank
4119 movdqu $inout2,0x20($out)
4120 pxor $inout2,$inout2
4122 movdqu $inout3,0x30($out)
4123 pxor $inout3,$inout3
4125 movdqa $inout4,$inout0
4126 pxor $inout4,$inout4
4127 pxor $inout5,$inout5
4129 jmp .Lcbc_dec_tail_collected
4135 &aesni_generate1("dec",$key,$rounds);
4139 jmp .Lcbc_dec_tail_collected
4143 call _aesni_decrypt2
4147 movdqu $inout0,($out)
4148 movdqa $inout1,$inout0
4149 pxor $inout1,$inout1 # clear register bank
4151 jmp .Lcbc_dec_tail_collected
4155 call _aesni_decrypt3
4159 movdqu $inout0,($out)
4161 movdqu $inout1,0x10($out)
4162 pxor $inout1,$inout1 # clear register bank
4163 movdqa $inout2,$inout0
4164 pxor $inout2,$inout2
4166 jmp .Lcbc_dec_tail_collected
4170 call _aesni_decrypt4
4174 movdqu $inout0,($out)
4176 movdqu $inout1,0x10($out)
4177 pxor $inout1,$inout1 # clear register bank
4179 movdqu $inout2,0x20($out)
4180 pxor $inout2,$inout2
4181 movdqa $inout3,$inout0
4182 pxor $inout3,$inout3
4184 jmp .Lcbc_dec_tail_collected
4187 .Lcbc_dec_clear_tail_collected:
4188 pxor $inout1,$inout1 # clear register bank
4189 pxor $inout2,$inout2
4190 pxor $inout3,$inout3
4192 $code.=<<___ if (!$win64);
4193 pxor $inout4,$inout4 # %xmm6..9
4194 pxor $inout5,$inout5
4195 pxor $inout6,$inout6
4196 pxor $inout7,$inout7
4199 .Lcbc_dec_tail_collected:
4202 jnz .Lcbc_dec_tail_partial
4203 movups $inout0,($out)
4204 pxor $inout0,$inout0
4207 .Lcbc_dec_tail_partial:
4208 movaps $inout0,(%rsp)
4209 pxor $inout0,$inout0
4214 .long 0x9066A4F3 # rep movsb
4215 movdqa $inout0,(%rsp)
4218 xorps $rndkey0,$rndkey0 # %xmm0
4219 pxor $rndkey1,$rndkey1
4221 $code.=<<___ if ($win64);
4222 movaps 0x10(%rsp),%xmm6
4223 movaps %xmm0,0x10(%rsp) # clear stack
4224 movaps 0x20(%rsp),%xmm7
4225 movaps %xmm0,0x20(%rsp)
4226 movaps 0x30(%rsp),%xmm8
4227 movaps %xmm0,0x30(%rsp)
4228 movaps 0x40(%rsp),%xmm9
4229 movaps %xmm0,0x40(%rsp)
4230 movaps 0x50(%rsp),%xmm10
4231 movaps %xmm0,0x50(%rsp)
4232 movaps 0x60(%rsp),%xmm11
4233 movaps %xmm0,0x60(%rsp)
4234 movaps 0x70(%rsp),%xmm12
4235 movaps %xmm0,0x70(%rsp)
4236 movaps 0x80(%rsp),%xmm13
4237 movaps %xmm0,0x80(%rsp)
4238 movaps 0x90(%rsp),%xmm14
4239 movaps %xmm0,0x90(%rsp)
4240 movaps 0xa0(%rsp),%xmm15
4241 movaps %xmm0,0xa0(%rsp)
4247 .cfi_def_cfa_register %rsp
4251 .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
4254 # int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
4255 # int bits, AES_KEY *key)
4257 # input: $inp user-supplied key
4258 # $bits $inp length in bits
4259 # $key pointer to key schedule
4260 # output: %eax 0 denoting success, -1 or -2 - failure (see C)
4261 # *$key key schedule
4263 { my ($inp,$bits,$key) = @_4args;
4267 .globl ${PREFIX}_set_decrypt_key
4268 .type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
4270 ${PREFIX}_set_decrypt_key:
4272 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
4273 .cfi_adjust_cfa_offset 8
4274 call __aesni_set_encrypt_key
4275 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
4278 lea 16($key,$bits),$inp # points at the end of key schedule
4280 $movkey ($key),%xmm0 # just swap
4281 $movkey ($inp),%xmm1
4282 $movkey %xmm0,($inp)
4283 $movkey %xmm1,($key)
4288 $movkey ($key),%xmm0 # swap and inverse
4289 $movkey ($inp),%xmm1
4294 $movkey %xmm0,16($inp)
4295 $movkey %xmm1,-16($key)
4297 ja .Ldec_key_inverse
4299 $movkey ($key),%xmm0 # inverse middle
4302 $movkey %xmm0,($inp)
4306 .cfi_adjust_cfa_offset -8
4309 .LSEH_end_set_decrypt_key:
4310 .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
4313 # This is based on submission from Intel by
4318 # Aggressively optimized with respect to aeskeygenassist's critical path
4319 # and is contained in %xmm0-5 to meet Win64 ABI requirement.
4321 # int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
4322 # int bits, AES_KEY * const key);
4324 # input: $inp user-supplied key
4325 # $bits $inp length in bits
4326 # $key pointer to key schedule
4327 # output: %eax 0 denoting success, -1 or -2 - failure (see C)
4328 # $bits rounds-1 (used in aesni_set_decrypt_key)
4329 # *$key key schedule
4330 # $key pointer to key schedule (used in
4331 # aesni_set_decrypt_key)
4333 # Subroutine is frame-less, which means that only volatile registers
4334 # are used. Note that it's declared "abi-omnipotent", which means that
4335 # the number of volatile registers is smaller on Windows.
4338 .globl ${PREFIX}_set_encrypt_key
4339 .type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
4341 ${PREFIX}_set_encrypt_key:
4342 __aesni_set_encrypt_key:
4344 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
4345 .cfi_adjust_cfa_offset 8
4352 mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits
4353 movups ($inp),%xmm0 # pull first 128 bits of *userKey
4354 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
4355 and OPENSSL_ia32cap_P+4(%rip),%r10d
4356 lea 16($key),%rax # %rax is used as modifiable copy of $key
4365 mov \$9,$bits # 10 rounds for 128-bit key
4366 cmp \$`1<<28`,%r10d # AVX, but no XOP
4369 $movkey %xmm0,($key) # round 0
4370 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
4371 call .Lkey_expansion_128_cold
4372 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
4373 call .Lkey_expansion_128
4374 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
4375 call .Lkey_expansion_128
4376 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
4377 call .Lkey_expansion_128
4378 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
4379 call .Lkey_expansion_128
4380 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
4381 call .Lkey_expansion_128
4382 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
4383 call .Lkey_expansion_128
4384 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
4385 call .Lkey_expansion_128
4386 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
4387 call .Lkey_expansion_128
4388 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
4389 call .Lkey_expansion_128
4390 $movkey %xmm0,(%rax)
4391 mov $bits,80(%rax) # 240(%rdx)
4397 movdqa .Lkey_rotate(%rip),%xmm5
4399 movdqa .Lkey_rcon1(%rip),%xmm4
4407 aesenclast %xmm4,%xmm0
4420 movdqu %xmm0,-16(%rax)
4426 movdqa .Lkey_rcon1b(%rip),%xmm4
4429 aesenclast %xmm4,%xmm0
4445 aesenclast %xmm4,%xmm0
4456 movdqu %xmm0,16(%rax)
4458 mov $bits,96(%rax) # 240($key)
4464 movq 16($inp),%xmm2 # remaining 1/3 of *userKey
4465 mov \$11,$bits # 12 rounds for 192
4466 cmp \$`1<<28`,%r10d # AVX, but no XOP
4469 $movkey %xmm0,($key) # round 0
4470 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
4471 call .Lkey_expansion_192a_cold
4472 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
4473 call .Lkey_expansion_192b
4474 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
4475 call .Lkey_expansion_192a
4476 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
4477 call .Lkey_expansion_192b
4478 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
4479 call .Lkey_expansion_192a
4480 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
4481 call .Lkey_expansion_192b
4482 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
4483 call .Lkey_expansion_192a
4484 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
4485 call .Lkey_expansion_192b
4486 $movkey %xmm0,(%rax)
4487 mov $bits,48(%rax) # 240(%rdx)
4493 movdqa .Lkey_rotate192(%rip),%xmm5
4494 movdqa .Lkey_rcon1(%rip),%xmm4
4504 aesenclast %xmm4,%xmm2
4516 pshufd \$0xff,%xmm0,%xmm3
4523 movdqu %xmm0,-16(%rax)
4528 mov $bits,32(%rax) # 240($key)
4534 movups 16($inp),%xmm2 # remaining half of *userKey
4535 mov \$13,$bits # 14 rounds for 256
4537 cmp \$`1<<28`,%r10d # AVX, but no XOP
4540 $movkey %xmm0,($key) # round 0
4541 $movkey %xmm2,16($key) # round 1
4542 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
4543 call .Lkey_expansion_256a_cold
4544 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
4545 call .Lkey_expansion_256b
4546 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
4547 call .Lkey_expansion_256a
4548 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
4549 call .Lkey_expansion_256b
4550 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
4551 call .Lkey_expansion_256a
4552 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
4553 call .Lkey_expansion_256b
4554 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
4555 call .Lkey_expansion_256a
4556 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
4557 call .Lkey_expansion_256b
4558 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
4559 call .Lkey_expansion_256a
4560 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
4561 call .Lkey_expansion_256b
4562 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
4563 call .Lkey_expansion_256a
4564 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
4565 call .Lkey_expansion_256b
4566 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
4567 call .Lkey_expansion_256a
4568 $movkey %xmm0,(%rax)
4569 mov $bits,16(%rax) # 240(%rdx)
4575 movdqa .Lkey_rotate(%rip),%xmm5
4576 movdqa .Lkey_rcon1(%rip),%xmm4
4578 movdqu %xmm0,0($key)
4580 movdqu %xmm2,16($key)
4586 aesenclast %xmm4,%xmm2
4603 pshufd \$0xff,%xmm0,%xmm2
4605 aesenclast %xmm3,%xmm2
4616 movdqu %xmm2,16(%rax)
4623 mov $bits,16(%rax) # 240($key)
4638 .cfi_adjust_cfa_offset -8
4641 .LSEH_end_set_encrypt_key:
4644 .Lkey_expansion_128:
4645 $movkey %xmm0,(%rax)
4647 .Lkey_expansion_128_cold:
4648 shufps \$0b00010000,%xmm0,%xmm4
4650 shufps \$0b10001100,%xmm0,%xmm4
4652 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4657 .Lkey_expansion_192a:
4658 $movkey %xmm0,(%rax)
4660 .Lkey_expansion_192a_cold:
4662 .Lkey_expansion_192b_warm:
4663 shufps \$0b00010000,%xmm0,%xmm4
4666 shufps \$0b10001100,%xmm0,%xmm4
4669 pshufd \$0b01010101,%xmm1,%xmm1 # critical path
4672 pshufd \$0b11111111,%xmm0,%xmm3
4677 .Lkey_expansion_192b:
4679 shufps \$0b01000100,%xmm0,%xmm5
4680 $movkey %xmm5,(%rax)
4681 shufps \$0b01001110,%xmm2,%xmm3
4682 $movkey %xmm3,16(%rax)
4684 jmp .Lkey_expansion_192b_warm
4687 .Lkey_expansion_256a:
4688 $movkey %xmm2,(%rax)
4690 .Lkey_expansion_256a_cold:
4691 shufps \$0b00010000,%xmm0,%xmm4
4693 shufps \$0b10001100,%xmm0,%xmm4
4695 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4700 .Lkey_expansion_256b:
4701 $movkey %xmm0,(%rax)
4704 shufps \$0b00010000,%xmm2,%xmm4
4706 shufps \$0b10001100,%xmm2,%xmm4
4708 shufps \$0b10101010,%xmm1,%xmm1 # critical path
4711 .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
4712 .size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
4719 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
4727 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4729 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
4731 .long 0x04070605,0x04070605,0x04070605,0x04070605
4735 .long 0x1b,0x1b,0x1b,0x1b
4737 .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
4741 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4742 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
4750 .extern __imp_RtlVirtualUnwind
4752 $code.=<<___ if ($PREFIX eq "aesni");
4753 .type ecb_ccm64_se_handler,\@abi-omnipotent
4755 ecb_ccm64_se_handler:
4767 mov 120($context),%rax # pull context->Rax
4768 mov 248($context),%rbx # pull context->Rip
4770 mov 8($disp),%rsi # disp->ImageBase
4771 mov 56($disp),%r11 # disp->HandlerData
4773 mov 0(%r11),%r10d # HandlerData[0]
4774 lea (%rsi,%r10),%r10 # prologue label
4775 cmp %r10,%rbx # context->Rip<prologue label
4776 jb .Lcommon_seh_tail
4778 mov 152($context),%rax # pull context->Rsp
4780 mov 4(%r11),%r10d # HandlerData[1]
4781 lea (%rsi,%r10),%r10 # epilogue label
4782 cmp %r10,%rbx # context->Rip>=epilogue label
4783 jae .Lcommon_seh_tail
4785 lea 0(%rax),%rsi # %xmm save area
4786 lea 512($context),%rdi # &context.Xmm6
4787 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
4788 .long 0xa548f3fc # cld; rep movsq
4789 lea 0x58(%rax),%rax # adjust stack pointer
4791 jmp .Lcommon_seh_tail
4792 .size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
4794 .type ctr_xts_se_handler,\@abi-omnipotent
4808 mov 120($context),%rax # pull context->Rax
4809 mov 248($context),%rbx # pull context->Rip
4811 mov 8($disp),%rsi # disp->ImageBase
4812 mov 56($disp),%r11 # disp->HandlerData
4814 mov 0(%r11),%r10d # HandlerData[0]
4815 lea (%rsi,%r10),%r10 # prologue label
4816 cmp %r10,%rbx # context->Rip<prologue label
4817 jb .Lcommon_seh_tail
4819 mov 152($context),%rax # pull context->Rsp
4821 mov 4(%r11),%r10d # HandlerData[1]
4822 lea (%rsi,%r10),%r10 # epilogue label
4823 cmp %r10,%rbx # context->Rip>=epilogue label
4824 jae .Lcommon_seh_tail
4826 mov 208($context),%rax # pull context->R11
4828 lea -0xa8(%rax),%rsi # %xmm save area
4829 lea 512($context),%rdi # & context.Xmm6
4830 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4831 .long 0xa548f3fc # cld; rep movsq
4833 mov -8(%rax),%rbp # restore saved %rbp
4834 mov %rbp,160($context) # restore context->Rbp
4835 jmp .Lcommon_seh_tail
4836 .size ctr_xts_se_handler,.-ctr_xts_se_handler
4838 .type ocb_se_handler,\@abi-omnipotent
4852 mov 120($context),%rax # pull context->Rax
4853 mov 248($context),%rbx # pull context->Rip
4855 mov 8($disp),%rsi # disp->ImageBase
4856 mov 56($disp),%r11 # disp->HandlerData
4858 mov 0(%r11),%r10d # HandlerData[0]
4859 lea (%rsi,%r10),%r10 # prologue label
4860 cmp %r10,%rbx # context->Rip<prologue label
4861 jb .Lcommon_seh_tail
4863 mov 4(%r11),%r10d # HandlerData[1]
4864 lea (%rsi,%r10),%r10 # epilogue label
4865 cmp %r10,%rbx # context->Rip>=epilogue label
4866 jae .Lcommon_seh_tail
4868 mov 8(%r11),%r10d # HandlerData[2]
4869 lea (%rsi,%r10),%r10
4870 cmp %r10,%rbx # context->Rip>=pop label
4873 mov 152($context),%rax # pull context->Rsp
4875 lea (%rax),%rsi # %xmm save area
4876 lea 512($context),%rdi # & context.Xmm6
4877 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4878 .long 0xa548f3fc # cld; rep movsq
4879 lea 0xa0+0x28(%rax),%rax
4888 mov %rbx,144($context) # restore context->Rbx
4889 mov %rbp,160($context) # restore context->Rbp
4890 mov %r12,216($context) # restore context->R12
4891 mov %r13,224($context) # restore context->R13
4892 mov %r14,232($context) # restore context->R14
4894 jmp .Lcommon_seh_tail
4895 .size ocb_se_handler,.-ocb_se_handler
4898 .type cbc_se_handler,\@abi-omnipotent
4912 mov 152($context),%rax # pull context->Rsp
4913 mov 248($context),%rbx # pull context->Rip
4915 lea .Lcbc_decrypt_bulk(%rip),%r10
4916 cmp %r10,%rbx # context->Rip<"prologue" label
4917 jb .Lcommon_seh_tail
4919 mov 120($context),%rax # pull context->Rax
4921 lea .Lcbc_decrypt_body(%rip),%r10
4922 cmp %r10,%rbx # context->Rip<cbc_decrypt_body
4923 jb .Lcommon_seh_tail
4925 mov 152($context),%rax # pull context->Rsp
4927 lea .Lcbc_ret(%rip),%r10
4928 cmp %r10,%rbx # context->Rip>="epilogue" label
4929 jae .Lcommon_seh_tail
4931 lea 16(%rax),%rsi # %xmm save area
4932 lea 512($context),%rdi # &context.Xmm6
4933 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4934 .long 0xa548f3fc # cld; rep movsq
4936 mov 208($context),%rax # pull context->R11
4938 mov -8(%rax),%rbp # restore saved %rbp
4939 mov %rbp,160($context) # restore context->Rbp
4944 mov %rax,152($context) # restore context->Rsp
4945 mov %rsi,168($context) # restore context->Rsi
4946 mov %rdi,176($context) # restore context->Rdi
4948 mov 40($disp),%rdi # disp->ContextRecord
4949 mov $context,%rsi # context
4950 mov \$154,%ecx # sizeof(CONTEXT)/sizeof(%rax)
4951 .long 0xa548f3fc # cld; rep movsq
4954 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
4955 mov 8(%rsi),%rdx # arg2, disp->ImageBase
4956 mov 0(%rsi),%r8 # arg3, disp->ControlPc
4957 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
4958 mov 40(%rsi),%r10 # disp->ContextRecord
4959 lea 56(%rsi),%r11 # &disp->HandlerData
4960 lea 24(%rsi),%r12 # &disp->EstablisherFrame
4961 mov %r10,32(%rsp) # arg5
4962 mov %r11,40(%rsp) # arg6
4963 mov %r12,48(%rsp) # arg7
4964 mov %rcx,56(%rsp) # arg8, (NULL)
4965 call *__imp_RtlVirtualUnwind(%rip)
4967 mov \$1,%eax # ExceptionContinueSearch
4979 .size cbc_se_handler,.-cbc_se_handler
4984 $code.=<<___ if ($PREFIX eq "aesni");
4985 .rva .LSEH_begin_aesni_ecb_encrypt
4986 .rva .LSEH_end_aesni_ecb_encrypt
4989 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
4990 .rva .LSEH_end_aesni_ccm64_encrypt_blocks
4991 .rva .LSEH_info_ccm64_enc
4993 .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
4994 .rva .LSEH_end_aesni_ccm64_decrypt_blocks
4995 .rva .LSEH_info_ccm64_dec
4997 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
4998 .rva .LSEH_end_aesni_ctr32_encrypt_blocks
4999 .rva .LSEH_info_ctr32
5001 .rva .LSEH_begin_aesni_xts_encrypt
5002 .rva .LSEH_end_aesni_xts_encrypt
5003 .rva .LSEH_info_xts_enc
5005 .rva .LSEH_begin_aesni_xts_decrypt
5006 .rva .LSEH_end_aesni_xts_decrypt
5007 .rva .LSEH_info_xts_dec
5009 .rva .LSEH_begin_aesni_ocb_encrypt
5010 .rva .LSEH_end_aesni_ocb_encrypt
5011 .rva .LSEH_info_ocb_enc
5013 .rva .LSEH_begin_aesni_ocb_decrypt
5014 .rva .LSEH_end_aesni_ocb_decrypt
5015 .rva .LSEH_info_ocb_dec
5018 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
5019 .rva .LSEH_end_${PREFIX}_cbc_encrypt
5022 .rva ${PREFIX}_set_decrypt_key
5023 .rva .LSEH_end_set_decrypt_key
5026 .rva ${PREFIX}_set_encrypt_key
5027 .rva .LSEH_end_set_encrypt_key
5032 $code.=<<___ if ($PREFIX eq "aesni");
5035 .rva ecb_ccm64_se_handler
5036 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[]
5037 .LSEH_info_ccm64_enc:
5039 .rva ecb_ccm64_se_handler
5040 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
5041 .LSEH_info_ccm64_dec:
5043 .rva ecb_ccm64_se_handler
5044 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
5047 .rva ctr_xts_se_handler
5048 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
5051 .rva ctr_xts_se_handler
5052 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
5055 .rva ctr_xts_se_handler
5056 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
5060 .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[]
5066 .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[]
5075 .byte 0x01,0x04,0x01,0x00
5076 .byte 0x04,0x02,0x00,0x00 # sub rsp,8
5081 local *opcode=shift;
5085 $rex|=0x04 if($dst>=8);
5086 $rex|=0x01 if($src>=8);
5087 push @opcode,$rex|0x40 if($rex);
5094 if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
5095 rex(\@opcode,$4,$3);
5096 push @opcode,0x0f,0x3a,0xdf;
5097 push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
5099 push @opcode,$c=~/^0/?oct($c):$c;
5100 return ".byte\t".join(',',@opcode);
5102 elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
5105 "aesenc" => 0xdc, "aesenclast" => 0xdd,
5106 "aesdec" => 0xde, "aesdeclast" => 0xdf
5108 return undef if (!defined($opcodelet{$1}));
5109 rex(\@opcode,$3,$2);
5110 push @opcode,0x0f,0x38,$opcodelet{$1};
5111 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
5112 return ".byte\t".join(',',@opcode);
5114 elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
5116 "aesenc" => 0xdc, "aesenclast" => 0xdd,
5117 "aesdec" => 0xde, "aesdeclast" => 0xdf
5119 return undef if (!defined($opcodelet{$1}));
5121 push @opcode,0x44 if ($3>=8);
5122 push @opcode,0x0f,0x38,$opcodelet{$1};
5123 push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
5124 push @opcode,($off=~/^0/?oct($off):$off)&0xff;
5125 return ".byte\t".join(',',@opcode);
5131 ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift;
5134 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
5135 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
5136 #$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
5137 $code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;