2 # Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements support for Intel AES-NI extension. In
18 # OpenSSL context it's used with Intel engine, but can also be used as
19 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
24 # To start with see corresponding paragraph in aesni-x86_64.pl...
25 # Instead of filling table similar to one found there I've chosen to
26 # summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
27 # The simplified table below represents 32-bit performance relative
28 # to 64-bit one in every given point. Ratios vary for different
29 # encryption modes, therefore interval values.
31 # 16-byte 64-byte 256-byte 1-KB 8-KB
32 # 53-67% 67-84% 91-94% 95-98% 97-99.5%
34 # Lower ratios for smaller block sizes are perfectly understandable,
35 # because function call overhead is higher in 32-bit mode. Largest
36 # 8-KB block performance is virtually same: 32-bit code is less than
37 # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
41 # See aesni-x86_64.pl for details. Unlike x86_64 version this module
42 # interleaves at most 6 aes[enc|dec] instructions, because there are
43 # not enough registers for 8x interleave [which should be optimal for
44 # Sandy Bridge]. Actually, performance results for 6x interleave
45 # factor presented in aesni-x86_64.pl (except for CTR) are for this
50 # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
51 # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
55 # Add aesni_ocb_[en|de]crypt.
57 ######################################################################
58 # Current large-block performance in cycles per byte processed with
59 # 128-bit key (less is better).
61 # CBC en-/decrypt CTR XTS ECB OCB
62 # Westmere 3.77/1.37 1.37 1.52 1.27
63 # * Bridge 5.07/0.98 0.99 1.09 0.91 1.10
64 # Haswell 4.44/0.80 0.97 1.03 0.72 0.76
65 # Skylake 2.68/0.65 0.65 0.66 0.64 0.66
66 # Silvermont 5.77/3.56 3.67 4.03 3.46 4.03
67 # Goldmont 3.84/1.39 1.39 1.63 1.31 1.70
68 # Bulldozer 5.80/0.98 1.05 1.24 0.93 1.23
70 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
71 # generates drop-in replacement for
72 # crypto/aes/asm/aes-586.pl:-)
73 $inline=1; # inline _aesni_[en|de]crypt
# Capture the directory part of this script's path ($0, up to and including
# the last '/' or '\') and add it, plus its ../../perlasm sibling, to @INC.
75 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
76 push(@INC,"${dir}","${dir}../../perlasm");
# The last command-line argument is taken as the output file name; when one
# is present STDOUT is reopened onto it. The result of open is not checked.
79 $output = pop and open STDOUT,">$output";
# Label declarations for the assembler back end.
# NOTE(review): external_label/static_label are defined outside this view;
# presumably they register an imported and a file-local symbol - confirm.
83 &external_label("OPENSSL_ia32cap_P");
84 &static_label("key_const");
# $movekey is the instruction used to load round keys. Both branches assign
# movups, so the choice does not currently depend on $PREFIX.
86 if ($PREFIX eq "aesni") { $movekey=\&movups; }
87 else { $movekey=\&movups; }
94 $rounds_="ebx"; # backup copy for $rounds
95 $key_="ebp"; # backup copy for $key
# Each line below gives two names to the same XMM register, so the two
# aliases on a line cannot hold independent values at the same time.
102 $inout3="xmm5"; $in1="xmm5";
103 $inout4="xmm6"; $in0="xmm6";
104 $inout5="xmm7"; $ivec="xmm7";
108 { my($dst,$src,$imm)=@_;
109 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
110 { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
113 { my($opcodelet,$dst,$src)=@_;
114 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
115 { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
# One emitter per AES instruction mnemonic: each forwards a fixed byte,
# followed by its own argument list, to aescommon.
# NOTE(review): aescommon's header line is not visible in this view;
# presumably the byte is the final opcode byte of the instruction - confirm.
117 sub aesimc { aescommon(0xdb,@_); }
118 sub aesenc { aescommon(0xdc,@_); }
119 sub aesenclast { aescommon(0xdd,@_); }
120 sub aesdec { aescommon(0xde,@_); }
121 sub aesdeclast { aescommon(0xdf,@_); }
123 # Inline version of internal aesni_[en|de]crypt1
#
# Emits the single-block cipher sequence directly at the call site instead
# of generating a separate function.
#   $p     - "enc" or "dec" at the call sites in this file; it is spliced
#            into the instruction names (aes${p}, aes${p}last) and into the
#            loop label.
#   $inout - register holding the block; defaults to $inout0.
#   $ivec  - optional register. When defined, round key 0 is xored into
#            $ivec and $ivec into $inout; otherwise round key 0 is xored
#            straight into $inout.
# NOTE(review): the loop label is made unique with $sn, and jnz needs a
# flag-setting instruction inside the loop; neither the $sn update nor that
# instruction is visible in this view - confirm against the complete file.
125 sub aesni_inline_generate1
126 { my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
# Load the first two round keys and step $key past them.
129 &$movekey ($rndkey0,&QWP(0,$key));
130 &$movekey ($rndkey1,&QWP(16,$key));
131 &xorps ($ivec,$rndkey0) if (defined($ivec));
132 &lea ($key,&DWP(32,$key));
133 &xorps ($inout,$ivec) if (defined($ivec));
134 &xorps ($inout,$rndkey0) if (!defined($ivec));
# Each iteration applies one round with $rndkey1, then fetches the next
# round key into $rndkey1 and advances $key by 16 bytes.
135 &set_label("${p}1_loop_$sn");
136 eval"&aes${p} ($inout,$rndkey1)";
138 &$movekey ($rndkey1,&QWP(0,$key));
139 &lea ($key,&DWP(16,$key));
140 &jnz (&label("${p}1_loop_$sn"));
# The round key loaded last is consumed by the closing aes${p}last.
141 eval"&aes${p}last ($inout,$rndkey1)";
# Emits the stand-alone function _aesni_${p}rypt1 ("enc" or "dec" is passed
# for $p by the callers in this file), which transforms one block held in
# $inout (default $inout0) with a fully unrolled round sequence.
#
# Round keys are loaded alternately into $rndkey0 and $rndkey1 so that each
# load is issued one round ahead of its use. The sequence has three entry
# points: falling through runs thirteen aes${p} rounds, the ${p}192 label
# eleven and the ${p}128 label nine, each followed by one aes${p}last.
# NOTE(review): the comparison that feeds jb/je is not visible in this
# view; presumably it tests $rounds - confirm against the complete file.
144 sub aesni_generate1 # fully unrolled loop
145 { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
147 &function_begin_B("_aesni_${p}rypt1");
148 &movups ($rndkey0,&QWP(0,$key));
149 &$movekey ($rndkey1,&QWP(0x10,$key));
150 &xorps ($inout,$rndkey0);
151 &$movekey ($rndkey0,&QWP(0x20,$key));
# $key is moved forward so the displacements used below stay valid for
# whichever entry point is taken: +0x30 here, and a further +0x20 for each
# of the two longer paths.
152 &lea ($key,&DWP(0x30,$key));
154 &jb (&label("${p}128"));
155 &lea ($key,&DWP(0x20,$key));
156 &je (&label("${p}192"));
157 &lea ($key,&DWP(0x20,$key));
# Two extra rounds executed only on the fall-through (longest) path.
158 eval"&aes${p} ($inout,$rndkey1)";
159 &$movekey ($rndkey1,&QWP(-0x40,$key));
160 eval"&aes${p} ($inout,$rndkey0)";
161 &$movekey ($rndkey0,&QWP(-0x30,$key));
# Two extra rounds shared by the fall-through and ${p}192 paths.
162 &set_label("${p}192");
163 eval"&aes${p} ($inout,$rndkey1)";
164 &$movekey ($rndkey1,&QWP(-0x20,$key));
165 eval"&aes${p} ($inout,$rndkey0)";
166 &$movekey ($rndkey0,&QWP(-0x10,$key));
# Common tail: nine rounds plus the final one, executed on every path.
167 &set_label("${p}128");
168 eval"&aes${p} ($inout,$rndkey1)";
169 &$movekey ($rndkey1,&QWP(0,$key));
170 eval"&aes${p} ($inout,$rndkey0)";
171 &$movekey ($rndkey0,&QWP(0x10,$key));
172 eval"&aes${p} ($inout,$rndkey1)";
173 &$movekey ($rndkey1,&QWP(0x20,$key));
174 eval"&aes${p} ($inout,$rndkey0)";
175 &$movekey ($rndkey0,&QWP(0x30,$key));
176 eval"&aes${p} ($inout,$rndkey1)";
177 &$movekey ($rndkey1,&QWP(0x40,$key));
178 eval"&aes${p} ($inout,$rndkey0)";
179 &$movekey ($rndkey0,&QWP(0x50,$key));
180 eval"&aes${p} ($inout,$rndkey1)";
181 &$movekey ($rndkey1,&QWP(0x60,$key));
182 eval"&aes${p} ($inout,$rndkey0)";
183 &$movekey ($rndkey0,&QWP(0x70,$key));
184 eval"&aes${p} ($inout,$rndkey1)";
185 eval"&aes${p}last ($inout,$rndkey0)";
187 &function_end_B("_aesni_${p}rypt1");
190 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block encrypt entry point. The stand-alone _aesni_encrypt1 helper
# is generated first only when inlining is disabled.
191 &aesni_generate1("enc") if (!$inline);
192 &function_begin_B("${PREFIX}_encrypt");
# eax is used first as the input pointer (argument 0) and, once the block
# has been loaded into $inout0, reused as the output pointer (argument 1).
193 &mov ("eax",&wparam(0));
194 &mov ($key,&wparam(2));
195 &movups ($inout0,&QWP(0,"eax"));
# The round count is read from byte offset 240 of the key structure.
196 &mov ($rounds,&DWP(240,$key));
197 &mov ("eax",&wparam(1));
# NOTE(review): the two braced statements below are alternatives (inline
# sequence vs. call to the helper); the if/else lines selecting between
# them are not visible in this view - presumably keyed on $inline.
199 { &aesni_inline_generate1("enc"); }
201 { &call ("_aesni_encrypt1"); }
202 &pxor ($rndkey0,$rndkey0); # clear register bank
203 &pxor ($rndkey1,$rndkey1);
# Store the result, then wipe the data register as well.
204 &movups (&QWP(0,"eax"),$inout0);
205 &pxor ($inout0,$inout0);
207 &function_end_B("${PREFIX}_encrypt");
209 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block decrypt entry point; mirrors ${PREFIX}_encrypt with the
# "dec" variants. The stand-alone _aesni_decrypt1 helper is generated first
# only when inlining is disabled.
210 &aesni_generate1("dec") if(!$inline);
211 &function_begin_B("${PREFIX}_decrypt");
# eax is used first as the input pointer (argument 0) and, once the block
# has been loaded into $inout0, reused as the output pointer (argument 1).
212 &mov ("eax",&wparam(0));
213 &mov ($key,&wparam(2));
214 &movups ($inout0,&QWP(0,"eax"));
# The round count is read from byte offset 240 of the key structure.
215 &mov ($rounds,&DWP(240,$key));
216 &mov ("eax",&wparam(1));
# NOTE(review): the two braced statements below are alternatives (inline
# sequence vs. call to the helper); the if/else lines selecting between
# them are not visible in this view - presumably keyed on $inline.
218 { &aesni_inline_generate1("dec"); }
220 { &call ("_aesni_decrypt1"); }
221 &pxor ($rndkey0,$rndkey0); # clear register bank
222 &pxor ($rndkey1,$rndkey1);
# Store the result, then wipe the data register as well.
223 &movups (&QWP(0,"eax"),$inout0);
224 &pxor ($inout0,$inout0);
226 &function_end_B("${PREFIX}_decrypt");
228 # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
229 # factor. Why were 3x subroutines originally used in loops? Even though
230 # aes[enc|dec] latency was originally 6, it could be scheduled only
231 # every *2nd* cycle. Thus 3x interleave was the one providing optimal
232 # utilization, i.e. when subroutine's throughput is virtually same as
233 # of non-interleaved subroutine [for number of input blocks up to 3].
234 # This is why it originally made no sense to implement 2x subroutine.
235 # But times change and it became appropriate to spend extra 192 bytes
236 # on 2x subroutine on account of Atom Silvermont. For processors that
237 # can schedule aes[enc|dec] every cycle optimal interleave factor
238 # equals the corresponding instruction's latency. 8x is optimal for
239 # * Bridge, but it's unfeasible to accommodate such implementation
240 # in XMM registers addressable in 32-bit mode and therefore maximum
241 # of 6x is used instead...
# Body emitting _aesni_${p}rypt2: two blocks ($inout0, $inout1) are taken
# through the cipher in lock-step, sharing every round key.
# NOTE(review): the enclosing sub header, the instructions that adjust
# $rounds and the one that sets the flags for jnz are not visible in this
# view - confirm against the complete file.
246 &function_begin_B("_aesni_${p}rypt2");
247 &$movekey ($rndkey0,&QWP(0,$key));
249 &$movekey ($rndkey1,&QWP(16,$key));
# Round key 0 is xored into both blocks.
250 &xorps ($inout0,$rndkey0);
251 &pxor ($inout1,$rndkey0);
252 &$movekey ($rndkey0,&QWP(32,$key));
# $key is advanced by 32 plus $rounds; later round keys are addressed
# relative to it with $rounds as the index.
253 &lea ($key,&DWP(32,$key,$rounds));
# Two rounds per iteration: one with $rndkey1, one with $rndkey0, each key
# being reloaded right after its use.
257 &set_label("${p}2_loop");
258 eval"&aes${p} ($inout0,$rndkey1)";
259 eval"&aes${p} ($inout1,$rndkey1)";
260 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
262 eval"&aes${p} ($inout0,$rndkey0)";
263 eval"&aes${p} ($inout1,$rndkey0)";
264 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
265 &jnz (&label("${p}2_loop"));
# One more regular round, then the final round, for both blocks.
266 eval"&aes${p} ($inout0,$rndkey1)";
267 eval"&aes${p} ($inout1,$rndkey1)";
268 eval"&aes${p}last ($inout0,$rndkey0)";
269 eval"&aes${p}last ($inout1,$rndkey0)";
271 &function_end_B("_aesni_${p}rypt2");
# Body emitting _aesni_${p}rypt3: three blocks ($inout0..$inout2) are taken
# through the cipher in lock-step, sharing every round key.
# NOTE(review): the enclosing sub header, the instructions that adjust
# $rounds and the one that sets the flags for jnz are not visible in this
# view - confirm against the complete file.
277 &function_begin_B("_aesni_${p}rypt3");
278 &$movekey ($rndkey0,&QWP(0,$key));
280 &$movekey ($rndkey1,&QWP(16,$key));
# Round key 0 is xored into all three blocks.
281 &xorps ($inout0,$rndkey0);
282 &pxor ($inout1,$rndkey0);
283 &pxor ($inout2,$rndkey0);
284 &$movekey ($rndkey0,&QWP(32,$key));
# $key is advanced by 32 plus $rounds; later round keys are addressed
# relative to it with $rounds as the index.
285 &lea ($key,&DWP(32,$key,$rounds));
# Two rounds per iteration: one with $rndkey1, one with $rndkey0, each key
# being reloaded right after its use.
289 &set_label("${p}3_loop");
290 eval"&aes${p} ($inout0,$rndkey1)";
291 eval"&aes${p} ($inout1,$rndkey1)";
292 eval"&aes${p} ($inout2,$rndkey1)";
293 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
295 eval"&aes${p} ($inout0,$rndkey0)";
296 eval"&aes${p} ($inout1,$rndkey0)";
297 eval"&aes${p} ($inout2,$rndkey0)";
298 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
299 &jnz (&label("${p}3_loop"));
# One more regular round, then the final round, for all three blocks.
300 eval"&aes${p} ($inout0,$rndkey1)";
301 eval"&aes${p} ($inout1,$rndkey1)";
302 eval"&aes${p} ($inout2,$rndkey1)";
303 eval"&aes${p}last ($inout0,$rndkey0)";
304 eval"&aes${p}last ($inout1,$rndkey0)";
305 eval"&aes${p}last ($inout2,$rndkey0)";
307 &function_end_B("_aesni_${p}rypt3");
310 # 4x interleave is implemented to improve small block performance,
311 # most notably [and naturally] 4 block by ~30%. One can argue that one
312 # should have implemented 5x as well, but improvement would be <20%,
313 # so it's not worth it...
# Body emitting _aesni_${p}rypt4: four blocks ($inout0..$inout3) are taken
# through the cipher in lock-step, sharing every round key.
# NOTE(review): the enclosing sub header, the instructions that adjust
# $rounds and the one that sets the flags for jnz are not visible in this
# view - confirm against the complete file.
317 &function_begin_B("_aesni_${p}rypt4");
318 &$movekey ($rndkey0,&QWP(0,$key));
319 &$movekey ($rndkey1,&QWP(16,$key));
# Round key 0 is xored into all four blocks.
321 &xorps ($inout0,$rndkey0);
322 &pxor ($inout1,$rndkey0);
323 &pxor ($inout2,$rndkey0);
324 &pxor ($inout3,$rndkey0);
325 &$movekey ($rndkey0,&QWP(32,$key));
# $key is advanced by 32 plus $rounds; later round keys are addressed
# relative to it with $rounds as the index.
326 &lea ($key,&DWP(32,$key,$rounds));
# Four raw bytes emitted ahead of the loop label.
# NOTE(review): 0f 1f 40 00 is the x86 4-byte NOP encoding, presumably
# padding to position the loop - confirm.
328 &data_byte (0x0f,0x1f,0x40,0x00);
# Two rounds per iteration: one with $rndkey1, one with $rndkey0, each key
# being reloaded right after its use.
331 &set_label("${p}4_loop");
332 eval"&aes${p} ($inout0,$rndkey1)";
333 eval"&aes${p} ($inout1,$rndkey1)";
334 eval"&aes${p} ($inout2,$rndkey1)";
335 eval"&aes${p} ($inout3,$rndkey1)";
336 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
338 eval"&aes${p} ($inout0,$rndkey0)";
339 eval"&aes${p} ($inout1,$rndkey0)";
340 eval"&aes${p} ($inout2,$rndkey0)";
341 eval"&aes${p} ($inout3,$rndkey0)";
342 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
343 &jnz (&label("${p}4_loop"));
# One more regular round, then the final round, for all four blocks.
345 eval"&aes${p} ($inout0,$rndkey1)";
346 eval"&aes${p} ($inout1,$rndkey1)";
347 eval"&aes${p} ($inout2,$rndkey1)";
348 eval"&aes${p} ($inout3,$rndkey1)";
349 eval"&aes${p}last ($inout0,$rndkey0)";
350 eval"&aes${p}last ($inout1,$rndkey0)";
351 eval"&aes${p}last ($inout2,$rndkey0)";
352 eval"&aes${p}last ($inout3,$rndkey0)";
354 &function_end_B("_aesni_${p}rypt4");
# Body emitting _aesni_${p}rypt6: six blocks ($inout0..$inout5) are taken
# through the cipher in lock-step, sharing every round key.
#
# The prologue interleaves the round-key-0 xors with the first $rndkey1
# round for $inout0..$inout2, then jumps into the middle of the loop body
# (_aesni_${p}rypt6_inner) so that only $inout3..$inout5 still receive that
# round. _aesni_${p}rypt6_enter is declared as a static label and placed
# after the $rndkey1 round; aesni_ctr32_encrypt_blocks calls it directly
# after issuing its own prologue.
# NOTE(review): the enclosing sub header, the instructions that adjust
# $rounds and the one that sets the flags for jnz are not visible in this
# view - confirm against the complete file.
360 &function_begin_B("_aesni_${p}rypt6");
361 &static_label("_aesni_${p}rypt6_enter");
362 &$movekey ($rndkey0,&QWP(0,$key));
364 &$movekey ($rndkey1,&QWP(16,$key));
365 &xorps ($inout0,$rndkey0);
366 &pxor ($inout1,$rndkey0); # pxor does better here
367 &pxor ($inout2,$rndkey0);
368 eval"&aes${p} ($inout0,$rndkey1)";
369 &pxor ($inout3,$rndkey0);
370 &pxor ($inout4,$rndkey0);
371 eval"&aes${p} ($inout1,$rndkey1)";
# $key is advanced by 32 plus $rounds; later round keys are addressed
# relative to it with $rounds as the index.
372 &lea ($key,&DWP(32,$key,$rounds));
374 eval"&aes${p} ($inout2,$rndkey1)";
375 &pxor ($inout5,$rndkey0);
376 &$movekey ($rndkey0,&QWP(0,$key,$rounds));
378 &jmp (&label("_aesni_${p}rypt6_inner"));
# Loop body: a $rndkey1 round and a $rndkey0 round for all six blocks.
380 &set_label("${p}6_loop",16);
381 eval"&aes${p} ($inout0,$rndkey1)";
382 eval"&aes${p} ($inout1,$rndkey1)";
383 eval"&aes${p} ($inout2,$rndkey1)";
# Entry from the prologue: the three rounds above were already issued there.
384 &set_label("_aesni_${p}rypt6_inner");
385 eval"&aes${p} ($inout3,$rndkey1)";
386 eval"&aes${p} ($inout4,$rndkey1)";
387 eval"&aes${p} ($inout5,$rndkey1)";
# Entry for callers that have issued the whole first $rndkey1 round.
388 &set_label("_aesni_${p}rypt6_enter");
389 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
391 eval"&aes${p} ($inout0,$rndkey0)";
392 eval"&aes${p} ($inout1,$rndkey0)";
393 eval"&aes${p} ($inout2,$rndkey0)";
394 eval"&aes${p} ($inout3,$rndkey0)";
395 eval"&aes${p} ($inout4,$rndkey0)";
396 eval"&aes${p} ($inout5,$rndkey0)";
397 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
398 &jnz (&label("${p}6_loop"));
# One more regular round, then the final round, for all six blocks.
400 eval"&aes${p} ($inout0,$rndkey1)";
401 eval"&aes${p} ($inout1,$rndkey1)";
402 eval"&aes${p} ($inout2,$rndkey1)";
403 eval"&aes${p} ($inout3,$rndkey1)";
404 eval"&aes${p} ($inout4,$rndkey1)";
405 eval"&aes${p} ($inout5,$rndkey1)";
406 eval"&aes${p}last ($inout0,$rndkey0)";
407 eval"&aes${p}last ($inout1,$rndkey0)";
408 eval"&aes${p}last ($inout2,$rndkey0)";
409 eval"&aes${p}last ($inout3,$rndkey0)";
410 eval"&aes${p}last ($inout4,$rndkey0)";
411 eval"&aes${p}last ($inout5,$rndkey0)";
413 &function_end_B("_aesni_${p}rypt6");
# Instantiate the interleaved helpers. The decrypt variants are always
# generated; the encrypt variants only when $PREFIX is "aesni".
415 &aesni_generate2("enc") if ($PREFIX eq "aesni");
416 &aesni_generate2("dec");
417 &aesni_generate3("enc") if ($PREFIX eq "aesni");
418 &aesni_generate3("dec");
419 &aesni_generate4("enc") if ($PREFIX eq "aesni");
420 &aesni_generate4("dec");
421 &aesni_generate6("enc") if ($PREFIX eq "aesni");
422 &aesni_generate6("dec");
424 if ($PREFIX eq "aesni") {
425 ######################################################################
426 # void aesni_ecb_encrypt (const void *in, void *out,
427 # size_t length, const AES_KEY *key,
#
# Arguments are read from wparam(0)..wparam(4) into $inp, $out, $len, $key
# and $rounds_. A zero fifth argument selects the ecb_decrypt path, any
# other value the encrypt path. Both paths process six blocks (0x60 bytes)
# per main-loop iteration and hand one to five leftover blocks to the
# 1x/2x/3x/4x/6x helpers. All eight XMM registers are cleared on exit.
# NOTE(review): the instructions that set the flags consumed by most of the
# jz/jb/je/jnc branches below are not visible in this view; check them
# against the complete file before changing any branch.
429 &function_begin("aesni_ecb_encrypt");
430 &mov ($inp,&wparam(0));
431 &mov ($out,&wparam(1));
432 &mov ($len,&wparam(2));
433 &mov ($key,&wparam(3));
434 &mov ($rounds_,&wparam(4));
436 &jz (&label("ecb_ret"));
# The round count is read from byte offset 240 of the key structure.
437 &mov ($rounds,&DWP(240,$key));
# $rounds_ still holds the fifth argument here: zero means decrypt.
438 &test ($rounds_,$rounds_);
439 &jz (&label("ecb_decrypt"));
######################################################################
# Encrypt path. The helpers modify $key and $rounds, so copies are kept.
441 &mov ($key_,$key); # backup $key
442 &mov ($rounds_,$rounds); # backup $rounds
444 &jb (&label("ecb_enc_tail"));
# Prime the pipeline with the first six input blocks.
446 &movdqu ($inout0,&QWP(0,$inp));
447 &movdqu ($inout1,&QWP(0x10,$inp));
448 &movdqu ($inout2,&QWP(0x20,$inp));
449 &movdqu ($inout3,&QWP(0x30,$inp));
450 &movdqu ($inout4,&QWP(0x40,$inp));
451 &movdqu ($inout5,&QWP(0x50,$inp));
452 &lea ($inp,&DWP(0x60,$inp));
454 &jmp (&label("ecb_enc_loop6_enter"));
# Main loop: store the six results of the previous pass while loading the
# next six inputs into the same registers.
456 &set_label("ecb_enc_loop6",16);
457 &movups (&QWP(0,$out),$inout0);
458 &movdqu ($inout0,&QWP(0,$inp));
459 &movups (&QWP(0x10,$out),$inout1);
460 &movdqu ($inout1,&QWP(0x10,$inp));
461 &movups (&QWP(0x20,$out),$inout2);
462 &movdqu ($inout2,&QWP(0x20,$inp));
463 &movups (&QWP(0x30,$out),$inout3);
464 &movdqu ($inout3,&QWP(0x30,$inp));
465 &movups (&QWP(0x40,$out),$inout4);
466 &movdqu ($inout4,&QWP(0x40,$inp));
467 &movups (&QWP(0x50,$out),$inout5);
468 &lea ($out,&DWP(0x60,$out));
469 &movdqu ($inout5,&QWP(0x50,$inp));
470 &lea ($inp,&DWP(0x60,$inp));
471 &set_label("ecb_enc_loop6_enter");
473 &call ("_aesni_encrypt6");
475 &mov ($key,$key_); # restore $key
476 &mov ($rounds,$rounds_); # restore $rounds
478 &jnc (&label("ecb_enc_loop6"));
# Flush the results of the last full six-block pass.
480 &movups (&QWP(0,$out),$inout0);
481 &movups (&QWP(0x10,$out),$inout1);
482 &movups (&QWP(0x20,$out),$inout2);
483 &movups (&QWP(0x30,$out),$inout3);
484 &movups (&QWP(0x40,$out),$inout4);
485 &movups (&QWP(0x50,$out),$inout5);
486 &lea ($out,&DWP(0x60,$out));
488 &jz (&label("ecb_ret"));
# Tail: blocks are loaded one at a time, branching to the matching helper
# as soon as the remaining count is reached.
490 &set_label("ecb_enc_tail");
491 &movups ($inout0,&QWP(0,$inp));
493 &jb (&label("ecb_enc_one"));
494 &movups ($inout1,&QWP(0x10,$inp));
495 &je (&label("ecb_enc_two"));
496 &movups ($inout2,&QWP(0x20,$inp));
498 &jb (&label("ecb_enc_three"));
499 &movups ($inout3,&QWP(0x30,$inp));
500 &je (&label("ecb_enc_four"));
# Five blocks: the sixth lane is zeroed, the 6x helper is used and only
# five results are stored.
501 &movups ($inout4,&QWP(0x40,$inp));
502 &xorps ($inout5,$inout5);
503 &call ("_aesni_encrypt6");
504 &movups (&QWP(0,$out),$inout0);
505 &movups (&QWP(0x10,$out),$inout1);
506 &movups (&QWP(0x20,$out),$inout2);
507 &movups (&QWP(0x30,$out),$inout3);
508 &movups (&QWP(0x40,$out),$inout4);
# Called with the '&' sigil like every other emitter call in this file.
509 &jmp (&label("ecb_ret"));
# NOTE(review): the two braced statements below are alternatives (inline
# sequence vs. call); the if/else lines selecting between them are not
# visible in this view - presumably keyed on $inline.
511 &set_label("ecb_enc_one",16);
513 { &aesni_inline_generate1("enc"); }
515 { &call ("_aesni_encrypt1"); }
516 &movups (&QWP(0,$out),$inout0);
517 &jmp (&label("ecb_ret"));
519 &set_label("ecb_enc_two",16);
520 &call ("_aesni_encrypt2");
521 &movups (&QWP(0,$out),$inout0);
522 &movups (&QWP(0x10,$out),$inout1);
523 &jmp (&label("ecb_ret"));
525 &set_label("ecb_enc_three",16);
526 &call ("_aesni_encrypt3");
527 &movups (&QWP(0,$out),$inout0);
528 &movups (&QWP(0x10,$out),$inout1);
529 &movups (&QWP(0x20,$out),$inout2);
530 &jmp (&label("ecb_ret"));
532 &set_label("ecb_enc_four",16);
533 &call ("_aesni_encrypt4");
534 &movups (&QWP(0,$out),$inout0);
535 &movups (&QWP(0x10,$out),$inout1);
536 &movups (&QWP(0x20,$out),$inout2);
537 &movups (&QWP(0x30,$out),$inout3);
538 &jmp (&label("ecb_ret"));
539 ######################################################################
# Decrypt path: same structure as the encrypt path, using the decrypt
# helpers.
540 &set_label("ecb_decrypt",16);
541 &mov ($key_,$key); # backup $key
542 &mov ($rounds_,$rounds); # backup $rounds
544 &jb (&label("ecb_dec_tail"));
# Prime the pipeline with the first six input blocks.
546 &movdqu ($inout0,&QWP(0,$inp));
547 &movdqu ($inout1,&QWP(0x10,$inp));
548 &movdqu ($inout2,&QWP(0x20,$inp));
549 &movdqu ($inout3,&QWP(0x30,$inp));
550 &movdqu ($inout4,&QWP(0x40,$inp));
551 &movdqu ($inout5,&QWP(0x50,$inp));
552 &lea ($inp,&DWP(0x60,$inp));
554 &jmp (&label("ecb_dec_loop6_enter"));
# Main loop: store the six results of the previous pass while loading the
# next six inputs into the same registers.
556 &set_label("ecb_dec_loop6",16);
557 &movups (&QWP(0,$out),$inout0);
558 &movdqu ($inout0,&QWP(0,$inp));
559 &movups (&QWP(0x10,$out),$inout1);
560 &movdqu ($inout1,&QWP(0x10,$inp));
561 &movups (&QWP(0x20,$out),$inout2);
562 &movdqu ($inout2,&QWP(0x20,$inp));
563 &movups (&QWP(0x30,$out),$inout3);
564 &movdqu ($inout3,&QWP(0x30,$inp));
565 &movups (&QWP(0x40,$out),$inout4);
566 &movdqu ($inout4,&QWP(0x40,$inp));
567 &movups (&QWP(0x50,$out),$inout5);
568 &lea ($out,&DWP(0x60,$out));
569 &movdqu ($inout5,&QWP(0x50,$inp));
570 &lea ($inp,&DWP(0x60,$inp));
571 &set_label("ecb_dec_loop6_enter");
573 &call ("_aesni_decrypt6");
575 &mov ($key,$key_); # restore $key
576 &mov ($rounds,$rounds_); # restore $rounds
578 &jnc (&label("ecb_dec_loop6"));
# Flush the results of the last full six-block pass.
580 &movups (&QWP(0,$out),$inout0);
581 &movups (&QWP(0x10,$out),$inout1);
582 &movups (&QWP(0x20,$out),$inout2);
583 &movups (&QWP(0x30,$out),$inout3);
584 &movups (&QWP(0x40,$out),$inout4);
585 &movups (&QWP(0x50,$out),$inout5);
586 &lea ($out,&DWP(0x60,$out));
588 &jz (&label("ecb_ret"));
# Tail: blocks are loaded one at a time, branching to the matching helper
# as soon as the remaining count is reached.
590 &set_label("ecb_dec_tail");
591 &movups ($inout0,&QWP(0,$inp));
593 &jb (&label("ecb_dec_one"));
594 &movups ($inout1,&QWP(0x10,$inp));
595 &je (&label("ecb_dec_two"));
596 &movups ($inout2,&QWP(0x20,$inp));
598 &jb (&label("ecb_dec_three"));
599 &movups ($inout3,&QWP(0x30,$inp));
600 &je (&label("ecb_dec_four"));
# Five blocks: the sixth lane is zeroed, the 6x helper is used and only
# five results are stored.
601 &movups ($inout4,&QWP(0x40,$inp));
602 &xorps ($inout5,$inout5);
603 &call ("_aesni_decrypt6");
604 &movups (&QWP(0,$out),$inout0);
605 &movups (&QWP(0x10,$out),$inout1);
606 &movups (&QWP(0x20,$out),$inout2);
607 &movups (&QWP(0x30,$out),$inout3);
608 &movups (&QWP(0x40,$out),$inout4);
609 &jmp (&label("ecb_ret"));
# NOTE(review): the two braced statements below are alternatives (inline
# sequence vs. call); the if/else lines selecting between them are not
# visible in this view - presumably keyed on $inline.
611 &set_label("ecb_dec_one",16);
613 { &aesni_inline_generate1("dec"); }
615 { &call ("_aesni_decrypt1"); }
616 &movups (&QWP(0,$out),$inout0);
617 &jmp (&label("ecb_ret"));
619 &set_label("ecb_dec_two",16);
620 &call ("_aesni_decrypt2");
621 &movups (&QWP(0,$out),$inout0);
622 &movups (&QWP(0x10,$out),$inout1);
623 &jmp (&label("ecb_ret"));
625 &set_label("ecb_dec_three",16);
626 &call ("_aesni_decrypt3");
627 &movups (&QWP(0,$out),$inout0);
628 &movups (&QWP(0x10,$out),$inout1);
629 &movups (&QWP(0x20,$out),$inout2);
630 &jmp (&label("ecb_ret"));
# The four-block case needs no jump: it falls through into ecb_ret.
632 &set_label("ecb_dec_four",16);
633 &call ("_aesni_decrypt4");
634 &movups (&QWP(0,$out),$inout0);
635 &movups (&QWP(0x10,$out),$inout1);
636 &movups (&QWP(0x20,$out),$inout2);
637 &movups (&QWP(0x30,$out),$inout3);
# Common exit: every XMM register used above is zeroed before returning.
639 &set_label("ecb_ret");
640 &pxor ("xmm0","xmm0"); # clear register bank
641 &pxor ("xmm1","xmm1");
642 &pxor ("xmm2","xmm2");
643 &pxor ("xmm3","xmm3");
644 &pxor ("xmm4","xmm4");
645 &pxor ("xmm5","xmm5");
646 &pxor ("xmm6","xmm6");
647 &pxor ("xmm7","xmm7");
648 &function_end("aesni_ecb_encrypt");
650 ######################################################################
651 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
652 # size_t blocks, const AES_KEY *key,
653 # const char *ivec,char *cmac);
655 # Handles only complete blocks, operates on 64-bit counter and
656 # does not update *ivec! Nor does it finalize CMAC value
657 # (see engine/eng_aesni.c for details)
# Emits aesni_ccm64_encrypt_blocks. Arguments 0..3 are loaded into $inp,
# $out, $len and $key; arguments 4 and 5 are the ivec and cmac pointers.
# Each outer-loop pass runs the counter block ($inout0) and the running
# $cmac through the cipher side by side, xors the result into the input
# block, stores it and steps the counter. The final $cmac is written back
# through argument 5.
# NOTE(review): the stack-frame setup, the values placed in $rounds_ and
# $key_ for the increment vector, and the instructions setting the flags
# for both jnz branches are not visible in this view - confirm against the
# complete file.
660 &function_begin("aesni_ccm64_encrypt_blocks");
661 &mov ($inp,&wparam(0));
662 &mov ($out,&wparam(1));
663 &mov ($len,&wparam(2));
664 &mov ($key,&wparam(3));
665 &mov ($rounds_,&wparam(4));
666 &mov ($rounds,&wparam(5));
669 &and ("esp",-16); # align stack
# Whatever $key_ holds here is parked at 48(esp) and reloaded into esp at
# the end of the function.
670 &mov (&DWP(48,"esp"),$key_);
672 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
673 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
# The round count is read from byte offset 240 of the key structure.
674 &mov ($rounds,&DWP(240,$key));
676 # compose byte-swap control mask for pshufb on stack
677 &mov (&DWP(0,"esp"),0x0c0d0e0f);
678 &mov (&DWP(4,"esp"),0x08090a0b);
679 &mov (&DWP(8,"esp"),0x04050607);
680 &mov (&DWP(12,"esp"),0x00010203);
682 # compose counter increment vector on stack
685 &mov (&DWP(16,"esp"),$rounds_);
686 &mov (&DWP(20,"esp"),$key_);
687 &mov (&DWP(24,"esp"),$key_);
688 &mov (&DWP(28,"esp"),$key_);
# $key_ keeps the start of the key schedule; $key is moved to
# key+32+$rounds and $rounds_ becomes the per-block loop index seed.
692 &lea ($key_,&DWP(0,$key));
693 &movdqa ($inout3,&QWP(0,"esp"));
694 &movdqa ($inout0,$ivec);
695 &lea ($key,&DWP(32,$key,$rounds));
696 &sub ($rounds_,$rounds);
697 &pshufb ($ivec,$inout3);
# Outer loop: one input block per pass.
699 &set_label("ccm64_enc_outer");
700 &$movekey ($rndkey0,&QWP(0,$key_));
701 &mov ($rounds,$rounds_);
702 &movups ($in0,&QWP(0,$inp));
704 &xorps ($inout0,$rndkey0);
705 &$movekey ($rndkey1,&QWP(16,$key_));
706 &xorps ($rndkey0,$in0);
707 &xorps ($cmac,$rndkey0); # cmac^=inp
708 &$movekey ($rndkey0,&QWP(32,$key_));
# Inner loop: two rounds per pass for both the counter block and $cmac.
710 &set_label("ccm64_enc2_loop");
711 &aesenc ($inout0,$rndkey1);
712 &aesenc ($cmac,$rndkey1);
713 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
715 &aesenc ($inout0,$rndkey0);
716 &aesenc ($cmac,$rndkey0);
717 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
718 &jnz (&label("ccm64_enc2_loop"));
719 &aesenc ($inout0,$rndkey1);
720 &aesenc ($cmac,$rndkey1);
# Step the counter by adding the increment vector stored at 16(esp).
721 &paddq ($ivec,&QWP(16,"esp"));
723 &aesenclast ($inout0,$rndkey0);
724 &aesenclast ($cmac,$rndkey0);
726 &lea ($inp,&DWP(16,$inp));
727 &xorps ($in0,$inout0); # inp^=E(ivec)
# Prepare the next counter block: copy $ivec and apply the byte-swap mask.
728 &movdqa ($inout0,$ivec);
729 &movups (&QWP(0,$out),$in0); # save output
730 &pshufb ($inout0,$inout3);
731 &lea ($out,&DWP(16,$out));
732 &jnz (&label("ccm64_enc_outer"));
# Reload esp from the slot written at the top, then store $cmac through
# the pointer passed as argument 5.
734 &mov ("esp",&DWP(48,"esp"));
735 &mov ($out,&wparam(5));
736 &movups (&QWP(0,$out),$cmac);
738 &pxor ("xmm0","xmm0"); # clear register bank
739 &pxor ("xmm1","xmm1");
740 &pxor ("xmm2","xmm2");
741 &pxor ("xmm3","xmm3");
742 &pxor ("xmm4","xmm4");
743 &pxor ("xmm5","xmm5");
744 &pxor ("xmm6","xmm6");
745 &pxor ("xmm7","xmm7");
746 &function_end("aesni_ccm64_encrypt_blocks");
# Emits aesni_ccm64_decrypt_blocks. Arguments 0..3 are loaded into $inp,
# $out, $len and $key; arguments 4 and 5 are the ivec and cmac pointers.
# The first counter block is encrypted on its own before the loop. Each
# outer-loop pass then xors the encrypted counter into the input block,
# stores the result, and runs the next counter block and the running $cmac
# through the cipher side by side. After the last block $cmac gets one more
# single-block pass and is written back through argument 5.
# NOTE(review): the stack-frame setup, the values placed in $rounds_ and
# $key_ for the increment vector, and the instructions setting the flags
# for the jz/jnz branches are not visible in this view - confirm against
# the complete file.
748 &function_begin("aesni_ccm64_decrypt_blocks");
749 &mov ($inp,&wparam(0));
750 &mov ($out,&wparam(1));
751 &mov ($len,&wparam(2));
752 &mov ($key,&wparam(3));
753 &mov ($rounds_,&wparam(4));
754 &mov ($rounds,&wparam(5));
757 &and ("esp",-16); # align stack
# Whatever $key_ holds here is parked at 48(esp) and reloaded into esp at
# the end of the function.
758 &mov (&DWP(48,"esp"),$key_);
760 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
761 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
# The round count is read from byte offset 240 of the key structure.
762 &mov ($rounds,&DWP(240,$key));
764 # compose byte-swap control mask for pshufb on stack
765 &mov (&DWP(0,"esp"),0x0c0d0e0f);
766 &mov (&DWP(4,"esp"),0x08090a0b);
767 &mov (&DWP(8,"esp"),0x04050607);
768 &mov (&DWP(12,"esp"),0x00010203);
770 # compose counter increment vector on stack
773 &mov (&DWP(16,"esp"),$rounds_);
774 &mov (&DWP(20,"esp"),$key_);
775 &mov (&DWP(24,"esp"),$key_);
776 &mov (&DWP(28,"esp"),$key_);
778 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
779 &movdqa ($inout0,$ivec);
782 &mov ($rounds_,$rounds);
784 &pshufb ($ivec,$inout3);
# Encrypt the first counter block on its own.
# NOTE(review): the two braced statements below are alternatives (inline
# sequence vs. call); the if/else lines selecting between them are not
# visible in this view - presumably keyed on $inline.
786 { &aesni_inline_generate1("enc"); }
788 { &call ("_aesni_encrypt1"); }
791 &movups ($in0,&QWP(0,$inp)); # load inp
# Step the counter by adding the increment vector stored at 16(esp).
792 &paddq ($ivec,&QWP(16,"esp"));
793 &lea ($inp,&QWP(16,$inp));
794 &sub ($rounds,$rounds_);
795 &lea ($key,&DWP(32,$key_,$rounds_));
796 &mov ($rounds_,$rounds);
797 &jmp (&label("ccm64_dec_outer"));
# Outer loop: emit one output block, then set up the next counter block.
799 &set_label("ccm64_dec_outer",16);
800 &xorps ($in0,$inout0); # inp ^= E(ivec)
801 &movdqa ($inout0,$ivec);
802 &movups (&QWP(0,$out),$in0); # save output
803 &lea ($out,&DWP(16,$out));
804 &pshufb ($inout0,$inout3);
807 &jz (&label("ccm64_dec_break"));
809 &$movekey ($rndkey0,&QWP(0,$key_));
810 &mov ($rounds,$rounds_);
811 &$movekey ($rndkey1,&QWP(16,$key_));
812 &xorps ($in0,$rndkey0);
813 &xorps ($inout0,$rndkey0);
814 &xorps ($cmac,$in0); # cmac^=out
815 &$movekey ($rndkey0,&QWP(32,$key_));
# Inner loop: two rounds per pass for both the counter block and $cmac.
817 &set_label("ccm64_dec2_loop");
818 &aesenc ($inout0,$rndkey1);
819 &aesenc ($cmac,$rndkey1);
820 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
822 &aesenc ($inout0,$rndkey0);
823 &aesenc ($cmac,$rndkey0);
824 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
825 &jnz (&label("ccm64_dec2_loop"));
# The next input block is fetched and the counter stepped while the last
# rounds complete.
826 &movups ($in0,&QWP(0,$inp)); # load inp
827 &paddq ($ivec,&QWP(16,"esp"));
828 &aesenc ($inout0,$rndkey1);
829 &aesenc ($cmac,$rndkey1);
830 &aesenclast ($inout0,$rndkey0);
831 &aesenclast ($cmac,$rndkey0);
832 &lea ($inp,&QWP(16,$inp));
833 &jmp (&label("ccm64_dec_outer"));
# After the last block: reload the round count and give $cmac its final
# single-block pass, with $in0 passed as the $ivec argument on the inline
# path.
835 &set_label("ccm64_dec_break",16);
836 &mov ($rounds,&DWP(240,$key_));
839 { &aesni_inline_generate1("enc",$cmac,$in0); }
841 { &call ("_aesni_encrypt1",$cmac); }
# Reload esp from the slot written at the top, then store $cmac through
# the pointer passed as argument 5.
843 &mov ("esp",&DWP(48,"esp"));
844 &mov ($out,&wparam(5));
845 &movups (&QWP(0,$out),$cmac);
847 &pxor ("xmm0","xmm0"); # clear register bank
848 &pxor ("xmm1","xmm1");
849 &pxor ("xmm2","xmm2");
850 &pxor ("xmm3","xmm3");
851 &pxor ("xmm4","xmm4");
852 &pxor ("xmm5","xmm5");
853 &pxor ("xmm6","xmm6");
854 &pxor ("xmm7","xmm7");
855 &function_end("aesni_ccm64_decrypt_blocks");
858 ######################################################################
859 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
860 # size_t blocks, const AES_KEY *key,
863 # Handles only complete blocks, operates on 32-bit counter and
864 # does not update *ivec! (see crypto/modes/ctr128.c for details)
868 # 16 vector addend: 0,6,6,6
869 # 32 counter-less ivec
870 # 48 1st triplet of counter vector
871 # 64 2nd triplet of counter vector
874 &function_begin("aesni_ctr32_encrypt_blocks");
875 &mov ($inp,&wparam(0));
876 &mov ($out,&wparam(1));
877 &mov ($len,&wparam(2));
878 &mov ($key,&wparam(3));
879 &mov ($rounds_,&wparam(4));
882 &and ("esp",-16); # align stack
883 &mov (&DWP(80,"esp"),$key_);
886 &je (&label("ctr32_one_shortcut"));
888 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
890 # compose byte-swap control mask for pshufb on stack
891 &mov (&DWP(0,"esp"),0x0c0d0e0f);
892 &mov (&DWP(4,"esp"),0x08090a0b);
893 &mov (&DWP(8,"esp"),0x04050607);
894 &mov (&DWP(12,"esp"),0x00010203);
896 # compose counter increment vector on stack
899 &mov (&DWP(16,"esp"),$rounds);
900 &mov (&DWP(20,"esp"),$rounds);
901 &mov (&DWP(24,"esp"),$rounds);
902 &mov (&DWP(28,"esp"),$key_);
904 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
905 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
907 &mov ($rounds,&DWP(240,$key)); # key->rounds
909 # compose 2 vectors of 3x32-bit counters
911 &pxor ($rndkey0,$rndkey0);
912 &pxor ($rndkey1,$rndkey1);
913 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
914 &pinsrd ($rndkey0,$rounds_,0);
915 &lea ($key_,&DWP(3,$rounds_));
916 &pinsrd ($rndkey1,$key_,0);
918 &pinsrd ($rndkey0,$rounds_,1);
920 &pinsrd ($rndkey1,$key_,1);
922 &pinsrd ($rndkey0,$rounds_,2);
924 &pinsrd ($rndkey1,$key_,2);
925 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
926 &pshufb ($rndkey0,$inout0); # byte swap
927 &movdqu ($inout4,&QWP(0,$key)); # key[0]
928 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
929 &pshufb ($rndkey1,$inout0); # byte swap
931 &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword
932 &pshufd ($inout1,$rndkey0,2<<6);
934 &jb (&label("ctr32_tail"));
935 &pxor ($inout5,$inout4); # counter-less ivec^key[0]
938 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0]
939 &mov ($key_,$key); # backup $key
940 &sub ($rounds_,$rounds); # backup twisted $rounds
941 &lea ($key,&DWP(32,$key,$rounds));
943 &jmp (&label("ctr32_loop6"));
945 &set_label("ctr32_loop6",16);
946 # inlining _aesni_encrypt6's prologue gives ~6% improvement...
947 &pshufd ($inout2,$rndkey0,1<<6);
948 &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec
949 &pshufd ($inout3,$rndkey1,3<<6);
950 &pxor ($inout0,$rndkey0); # merge counter-less ivec
951 &pshufd ($inout4,$rndkey1,2<<6);
952 &pxor ($inout1,$rndkey0);
953 &pshufd ($inout5,$rndkey1,1<<6);
954 &$movekey ($rndkey1,&QWP(16,$key_));
955 &pxor ($inout2,$rndkey0);
956 &pxor ($inout3,$rndkey0);
957 &aesenc ($inout0,$rndkey1);
958 &pxor ($inout4,$rndkey0);
959 &pxor ($inout5,$rndkey0);
960 &aesenc ($inout1,$rndkey1);
961 &$movekey ($rndkey0,&QWP(32,$key_));
962 &mov ($rounds,$rounds_);
963 &aesenc ($inout2,$rndkey1);
964 &aesenc ($inout3,$rndkey1);
965 &aesenc ($inout4,$rndkey1);
966 &aesenc ($inout5,$rndkey1);
968 &call (&label("_aesni_encrypt6_enter"));
970 &movups ($rndkey1,&QWP(0,$inp));
971 &movups ($rndkey0,&QWP(0x10,$inp));
972 &xorps ($inout0,$rndkey1);
973 &movups ($rndkey1,&QWP(0x20,$inp));
974 &xorps ($inout1,$rndkey0);
975 &movups (&QWP(0,$out),$inout0);
976 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
977 &xorps ($inout2,$rndkey1);
978 &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet
979 &movups (&QWP(0x10,$out),$inout1);
980 &movups (&QWP(0x20,$out),$inout2);
982 &paddd ($rndkey1,$rndkey0); # 2nd triplet increment
983 &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment
984 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
986 &movups ($inout1,&QWP(0x30,$inp));
987 &movups ($inout2,&QWP(0x40,$inp));
988 &xorps ($inout3,$inout1);
989 &movups ($inout1,&QWP(0x50,$inp));
990 &lea ($inp,&DWP(0x60,$inp));
991 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
992 &pshufb ($rndkey0,$inout0); # byte swap
993 &xorps ($inout4,$inout2);
994 &movups (&QWP(0x30,$out),$inout3);
995 &xorps ($inout5,$inout1);
996 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
997 &pshufb ($rndkey1,$inout0); # byte swap
998 &movups (&QWP(0x40,$out),$inout4);
999 &pshufd ($inout0,$rndkey0,3<<6);
1000 &movups (&QWP(0x50,$out),$inout5);
1001 &lea ($out,&DWP(0x60,$out));
1003 &pshufd ($inout1,$rndkey0,2<<6);
1005 &jnc (&label("ctr32_loop6"));
1008 &jz (&label("ctr32_ret"));
1009 &movdqu ($inout5,&QWP(0,$key_));
1011 &pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec
1012 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
1014 &set_label("ctr32_tail");
1015 &por ($inout0,$inout5);
1017 &jb (&label("ctr32_one"));
1019 &pshufd ($inout2,$rndkey0,1<<6);
1020 &por ($inout1,$inout5);
1021 &je (&label("ctr32_two"));
1023 &pshufd ($inout3,$rndkey1,3<<6);
1024 &por ($inout2,$inout5);
1026 &jb (&label("ctr32_three"));
1028 &pshufd ($inout4,$rndkey1,2<<6);
1029 &por ($inout3,$inout5);
1030 &je (&label("ctr32_four"));
1032 &por ($inout4,$inout5);
1033 &call ("_aesni_encrypt6");
1034 &movups ($rndkey1,&QWP(0,$inp));
1035 &movups ($rndkey0,&QWP(0x10,$inp));
1036 &xorps ($inout0,$rndkey1);
1037 &movups ($rndkey1,&QWP(0x20,$inp));
1038 &xorps ($inout1,$rndkey0);
1039 &movups ($rndkey0,&QWP(0x30,$inp));
1040 &xorps ($inout2,$rndkey1);
1041 &movups ($rndkey1,&QWP(0x40,$inp));
1042 &xorps ($inout3,$rndkey0);
1043 &movups (&QWP(0,$out),$inout0);
1044 &xorps ($inout4,$rndkey1);
1045 &movups (&QWP(0x10,$out),$inout1);
1046 &movups (&QWP(0x20,$out),$inout2);
1047 &movups (&QWP(0x30,$out),$inout3);
1048 &movups (&QWP(0x40,$out),$inout4);
1049 &jmp (&label("ctr32_ret"));
1051 &set_label("ctr32_one_shortcut",16);
1052 &movups ($inout0,&QWP(0,$rounds_)); # load ivec
1053 &mov ($rounds,&DWP(240,$key));
1055 &set_label("ctr32_one");
1057 { &aesni_inline_generate1("enc"); }
1059 { &call ("_aesni_encrypt1"); }
1060 &movups ($in0,&QWP(0,$inp));
1061 &xorps ($in0,$inout0);
1062 &movups (&QWP(0,$out),$in0);
1063 &jmp (&label("ctr32_ret"));
1065 &set_label("ctr32_two",16);
1066 &call ("_aesni_encrypt2");
1067 &movups ($inout3,&QWP(0,$inp));
1068 &movups ($inout4,&QWP(0x10,$inp));
1069 &xorps ($inout0,$inout3);
1070 &xorps ($inout1,$inout4);
1071 &movups (&QWP(0,$out),$inout0);
1072 &movups (&QWP(0x10,$out),$inout1);
1073 &jmp (&label("ctr32_ret"));
1075 &set_label("ctr32_three",16);
1076 &call ("_aesni_encrypt3");
1077 &movups ($inout3,&QWP(0,$inp));
1078 &movups ($inout4,&QWP(0x10,$inp));
1079 &xorps ($inout0,$inout3);
1080 &movups ($inout5,&QWP(0x20,$inp));
1081 &xorps ($inout1,$inout4);
1082 &movups (&QWP(0,$out),$inout0);
1083 &xorps ($inout2,$inout5);
1084 &movups (&QWP(0x10,$out),$inout1);
1085 &movups (&QWP(0x20,$out),$inout2);
1086 &jmp (&label("ctr32_ret"));
1088 &set_label("ctr32_four",16);
1089 &call ("_aesni_encrypt4");
1090 &movups ($inout4,&QWP(0,$inp));
1091 &movups ($inout5,&QWP(0x10,$inp));
1092 &movups ($rndkey1,&QWP(0x20,$inp));
1093 &xorps ($inout0,$inout4);
1094 &movups ($rndkey0,&QWP(0x30,$inp));
1095 &xorps ($inout1,$inout5);
1096 &movups (&QWP(0,$out),$inout0);
1097 &xorps ($inout2,$rndkey1);
1098 &movups (&QWP(0x10,$out),$inout1);
1099 &xorps ($inout3,$rndkey0);
1100 &movups (&QWP(0x20,$out),$inout2);
1101 &movups (&QWP(0x30,$out),$inout3);
1103 &set_label("ctr32_ret");
1104 &pxor ("xmm0","xmm0"); # clear register bank
1105 &pxor ("xmm1","xmm1");
1106 &pxor ("xmm2","xmm2");
1107 &pxor ("xmm3","xmm3");
1108 &pxor ("xmm4","xmm4");
1109 &movdqa (&QWP(32,"esp"),"xmm0"); # clear stack
1110 &pxor ("xmm5","xmm5");
1111 &movdqa (&QWP(48,"esp"),"xmm0");
1112 &pxor ("xmm6","xmm6");
1113 &movdqa (&QWP(64,"esp"),"xmm0");
1114 &pxor ("xmm7","xmm7");
1115 &mov ("esp",&DWP(80,"esp"));
1116 &function_end("aesni_ctr32_encrypt_blocks");
1118 ######################################################################
1119 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1120 # const AES_KEY *key1, const AES_KEY *key2,
1121 # const unsigned char iv[16]);
# Register aliases for the XTS code that follows: $tweak (running tweak)
# reuses $rndkey1, $twtmp (scratch for the sign broadcast) reuses $rndkey0,
# $twres (carry/residue result) reuses $inout0 and $twmask (the mask
# constant loaded from the frame) reuses $inout1.
1123 { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
# aesni_xts_encrypt
#
# Arguments are fetched with wparam(): 0=inp, 1=out, 2=len, 3=key1,
# 4=key2, 5=clear-text tweak.  The tweak is first run through the
# single-block encryption with key2; the data is then processed with key1,
# six blocks per iteration in xts_enc_loop6, with separate tails for one to
# five blocks and a byte-wise "steal" loop followed by one more block
# encryption for a trailing partial block.
#
# Scratch frame (16-byte aligned):
#   16*0 .. 16*5   per-block tweaks
#   16*6           the mask constant (dwords 0x87, 0, 1, 0)
#   16*7+0         saved $len
#   16*7+4         value reloaded into %esp on exit
#
# Tweak update idiom used throughout: pcmpgtd against a zeroed register
# records the sign of every dword of the tweak, pshufd 0x13 plus pand with
# $twmask keep only the carry between the two 64-bit halves and the 0x87
# feedback, paddq shifts both halves left by one bit, and pxor folds the
# carry/feedback back in.
1125 &function_begin("aesni_xts_encrypt");
# Process the clear-text tweak with the key2 schedule; the result is
# picked up from $inout0 further down.
1126 &mov ($key,&wparam(4)); # key2
1127 &mov ($inp,&wparam(5)); # clear-text tweak
1129 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1130 &movups ($inout0,&QWP(0,$inp));
1132 { &aesni_inline_generate1("enc"); }
1134 { &call ("_aesni_encrypt1"); }
# Load the data arguments; from here on $key refers to key1.
1136 &mov ($inp,&wparam(0));
1137 &mov ($out,&wparam(1));
1138 &mov ($len,&wparam(2));
1139 &mov ($key,&wparam(3)); # key1
# Allocate and align the scratch frame, then fill in its fixed slots.
1142 &sub ("esp",16*7+8);
1143 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1144 &and ("esp",-16); # align stack
1146 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1147 &mov (&DWP(16*6+4,"esp"),0);
1148 &mov (&DWP(16*6+8,"esp"),1);
1149 &mov (&DWP(16*6+12,"esp"),0);
1150 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1151 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
# Seed the tweak chain from the processed tweak and load the mask.
1153 &movdqa ($tweak,$inout0);
1154 &pxor ($twtmp,$twtmp);
1155 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1156 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1159 &mov ($key_,$key); # backup $key
1160 &mov ($rounds_,$rounds); # backup $rounds
1162 &jc (&label("xts_enc_short"));
1166 &sub ($rounds_,$rounds);
1167 &lea ($key,&DWP(32,$key,$rounds));
1168 &jmp (&label("xts_enc_loop6"));
# Main loop: six input blocks are consumed and six output blocks are
# produced per iteration.
1170 &set_label("xts_enc_loop6",16);
# Park the current tweak in frame slot $i, then advance the tweak.
1171 for ($i=0;$i<4;$i++) {
1172 &pshufd ($twres,$twtmp,0x13);
1173 &pxor ($twtmp,$twtmp);
1174 &movdqa (&QWP(16*$i,"esp"),$tweak);
1175 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1176 &pand ($twres,$twmask); # isolate carry and residue
1177 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1178 &pxor ($tweak,$twres);
# The last tweak of the batch is assembled in $inout5 while the first
# round key and the first input block are being loaded.
1180 &pshufd ($inout5,$twtmp,0x13);
1181 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1182 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1183 &$movekey ($rndkey0,&QWP(0,$key_));
1184 &pand ($inout5,$twmask); # isolate carry and residue
1185 &movups ($inout0,&QWP(0,$inp)); # load input
1186 &pxor ($inout5,$tweak);
1188 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1189 &mov ($rounds,$rounds_); # restore $rounds
1190 &movdqu ($inout1,&QWP(16*1,$inp));
1191 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1192 &movdqu ($inout2,&QWP(16*2,$inp));
1193 &pxor ($inout1,$rndkey0);
1194 &movdqu ($inout3,&QWP(16*3,$inp));
1195 &pxor ($inout2,$rndkey0);
1196 &movdqu ($inout4,&QWP(16*4,$inp));
1197 &pxor ($inout3,$rndkey0);
1198 &movdqu ($rndkey1,&QWP(16*5,$inp));
1199 &pxor ($inout4,$rndkey0);
1200 &lea ($inp,&DWP(16*6,$inp));
1201 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1202 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1203 &pxor ($inout5,$rndkey1);
# First aesenc round for all six lanes, interleaved with the remaining
# tweak xors, before entering the shared six-block routine.
1205 &$movekey ($rndkey1,&QWP(16,$key_));
1206 &pxor ($inout1,&QWP(16*1,"esp"));
1207 &pxor ($inout2,&QWP(16*2,"esp"));
1208 &aesenc ($inout0,$rndkey1);
1209 &pxor ($inout3,&QWP(16*3,"esp"));
1210 &pxor ($inout4,&QWP(16*4,"esp"));
1211 &aesenc ($inout1,$rndkey1);
1212 &pxor ($inout5,$rndkey0);
1213 &$movekey ($rndkey0,&QWP(32,$key_));
1214 &aesenc ($inout2,$rndkey1);
1215 &aesenc ($inout3,$rndkey1);
1216 &aesenc ($inout4,$rndkey1);
1217 &aesenc ($inout5,$rndkey1);
1218 &call (&label("_aesni_encrypt6_enter"));
# Xor the saved tweaks into the six results and store them, while the
# tweak for the next block is derived from the last one of this batch.
1220 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1221 &pxor ($twtmp,$twtmp);
1222 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1223 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1224 &xorps ($inout1,&QWP(16*1,"esp"));
1225 &movups (&QWP(16*0,$out),$inout0); # write output
1226 &xorps ($inout2,&QWP(16*2,"esp"));
1227 &movups (&QWP(16*1,$out),$inout1);
1228 &xorps ($inout3,&QWP(16*3,"esp"));
1229 &movups (&QWP(16*2,$out),$inout2);
1230 &xorps ($inout4,&QWP(16*4,"esp"));
1231 &movups (&QWP(16*3,$out),$inout3);
1232 &xorps ($inout5,$tweak);
1233 &movups (&QWP(16*4,$out),$inout4);
1234 &pshufd ($twres,$twtmp,0x13);
1235 &movups (&QWP(16*5,$out),$inout5);
1236 &lea ($out,&DWP(16*6,$out));
1237 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1239 &pxor ($twtmp,$twtmp);
1240 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1241 &pand ($twres,$twmask); # isolate carry and residue
1242 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1243 &pxor ($tweak,$twres);
1246 &jnc (&label("xts_enc_loop6"));
1248 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
1249 &mov ($key,$key_); # restore $key
1250 &mov ($rounds_,$rounds);
# Tail dispatch: the branches below select the one/two/three/four block
# handlers (or xts_enc_done6x); the fall-through path processes five
# blocks.  Tweaks are generated incrementally between the branches and
# put aside in $inout3, $inout4 and $inout5.
1252 &set_label("xts_enc_short");
1254 &jz (&label("xts_enc_done6x"));
1256 &movdqa ($inout3,$tweak); # put aside previous tweak
1258 &jb (&label("xts_enc_one"));
1260 &pshufd ($twres,$twtmp,0x13);
1261 &pxor ($twtmp,$twtmp);
1262 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1263 &pand ($twres,$twmask); # isolate carry and residue
1264 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1265 &pxor ($tweak,$twres);
1266 &je (&label("xts_enc_two"));
1268 &pshufd ($twres,$twtmp,0x13);
1269 &pxor ($twtmp,$twtmp);
1270 &movdqa ($inout4,$tweak); # put aside previous tweak
1271 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1272 &pand ($twres,$twmask); # isolate carry and residue
1273 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1274 &pxor ($tweak,$twres);
1276 &jb (&label("xts_enc_three"));
1278 &pshufd ($twres,$twtmp,0x13);
1279 &pxor ($twtmp,$twtmp);
1280 &movdqa ($inout5,$tweak); # put aside previous tweak
1281 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1282 &pand ($twres,$twmask); # isolate carry and residue
1283 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1284 &pxor ($tweak,$twres);
1285 &movdqa (&QWP(16*0,"esp"),$inout3);
1286 &movdqa (&QWP(16*1,"esp"),$inout4);
1287 &je (&label("xts_enc_four"));
# Five-block tail: tweaks 0..3 live in frame slots 0..3, the fifth is
# built in $inout5 and saved to slot 4; only five results are written.
1289 &movdqa (&QWP(16*2,"esp"),$inout5);
1290 &pshufd ($inout5,$twtmp,0x13);
1291 &movdqa (&QWP(16*3,"esp"),$tweak);
1292 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1293 &pand ($inout5,$twmask); # isolate carry and residue
1294 &pxor ($inout5,$tweak);
1296 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1297 &movdqu ($inout1,&QWP(16*1,$inp));
1298 &movdqu ($inout2,&QWP(16*2,$inp));
1299 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1300 &movdqu ($inout3,&QWP(16*3,$inp));
1301 &pxor ($inout1,&QWP(16*1,"esp"));
1302 &movdqu ($inout4,&QWP(16*4,$inp));
1303 &pxor ($inout2,&QWP(16*2,"esp"));
1304 &lea ($inp,&DWP(16*5,$inp));
1305 &pxor ($inout3,&QWP(16*3,"esp"));
1306 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1307 &pxor ($inout4,$inout5);
1309 &call ("_aesni_encrypt6");
1311 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1312 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1313 &xorps ($inout1,&QWP(16*1,"esp"));
1314 &xorps ($inout2,&QWP(16*2,"esp"));
1315 &movups (&QWP(16*0,$out),$inout0); # write output
1316 &xorps ($inout3,&QWP(16*3,"esp"));
1317 &movups (&QWP(16*1,$out),$inout1);
1318 &xorps ($inout4,$tweak);
1319 &movups (&QWP(16*2,$out),$inout2);
1320 &movups (&QWP(16*3,$out),$inout3);
1321 &movups (&QWP(16*4,$out),$inout4);
1322 &lea ($out,&DWP(16*5,$out));
1323 &jmp (&label("xts_enc_done"));
# One-block tail: the tweak is the one put aside in $inout3.
1325 &set_label("xts_enc_one",16);
1326 &movups ($inout0,&QWP(16*0,$inp)); # load input
1327 &lea ($inp,&DWP(16*1,$inp));
1328 &xorps ($inout0,$inout3); # input^=tweak
1330 { &aesni_inline_generate1("enc"); }
1332 { &call ("_aesni_encrypt1"); }
1333 &xorps ($inout0,$inout3); # output^=tweak
1334 &movups (&QWP(16*0,$out),$inout0); # write output
1335 &lea ($out,&DWP(16*1,$out));
1337 &movdqa ($tweak,$inout3); # last tweak
1338 &jmp (&label("xts_enc_done"));
# Two-block tail: tweaks are $inout3 and the current $tweak (copied to
# $inout4).
1340 &set_label("xts_enc_two",16);
1341 &movaps ($inout4,$tweak); # put aside last tweak
1343 &movups ($inout0,&QWP(16*0,$inp)); # load input
1344 &movups ($inout1,&QWP(16*1,$inp));
1345 &lea ($inp,&DWP(16*2,$inp));
1346 &xorps ($inout0,$inout3); # input^=tweak
1347 &xorps ($inout1,$inout4);
1349 &call ("_aesni_encrypt2");
1351 &xorps ($inout0,$inout3); # output^=tweak
1352 &xorps ($inout1,$inout4);
1353 &movups (&QWP(16*0,$out),$inout0); # write output
1354 &movups (&QWP(16*1,$out),$inout1);
1355 &lea ($out,&DWP(16*2,$out));
1357 &movdqa ($tweak,$inout4); # last tweak
1358 &jmp (&label("xts_enc_done"));
# Three-block tail: tweaks are $inout3, $inout4 and the current $tweak
# (copied to $inout5).
1360 &set_label("xts_enc_three",16);
1361 &movaps ($inout5,$tweak); # put aside last tweak
1362 &movups ($inout0,&QWP(16*0,$inp)); # load input
1363 &movups ($inout1,&QWP(16*1,$inp));
1364 &movups ($inout2,&QWP(16*2,$inp));
1365 &lea ($inp,&DWP(16*3,$inp));
1366 &xorps ($inout0,$inout3); # input^=tweak
1367 &xorps ($inout1,$inout4);
1368 &xorps ($inout2,$inout5);
1370 &call ("_aesni_encrypt3");
1372 &xorps ($inout0,$inout3); # output^=tweak
1373 &xorps ($inout1,$inout4);
1374 &xorps ($inout2,$inout5);
1375 &movups (&QWP(16*0,$out),$inout0); # write output
1376 &movups (&QWP(16*1,$out),$inout1);
1377 &movups (&QWP(16*2,$out),$inout2);
1378 &lea ($out,&DWP(16*3,$out));
1380 &movdqa ($tweak,$inout5); # last tweak
1381 &jmp (&label("xts_enc_done"));
# Four-block tail: the first two tweaks were stored to frame slots 0 and 1
# before the branch, the third is in $inout5 and the fourth is the current
# $tweak (copied to $inout4).
1383 &set_label("xts_enc_four",16);
1384 &movaps ($inout4,$tweak); # put aside last tweak
1386 &movups ($inout0,&QWP(16*0,$inp)); # load input
1387 &movups ($inout1,&QWP(16*1,$inp));
1388 &movups ($inout2,&QWP(16*2,$inp));
1389 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1390 &movups ($inout3,&QWP(16*3,$inp));
1391 &lea ($inp,&DWP(16*4,$inp));
1392 &xorps ($inout1,&QWP(16*1,"esp"));
1393 &xorps ($inout2,$inout5);
1394 &xorps ($inout3,$inout4);
1396 &call ("_aesni_encrypt4");
1398 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1399 &xorps ($inout1,&QWP(16*1,"esp"));
1400 &xorps ($inout2,$inout5);
1401 &movups (&QWP(16*0,$out),$inout0); # write output
1402 &xorps ($inout3,$inout4);
1403 &movups (&QWP(16*1,$out),$inout1);
1404 &movups (&QWP(16*2,$out),$inout2);
1405 &movups (&QWP(16*3,$out),$inout3);
1406 &lea ($out,&DWP(16*4,$out));
1408 &movdqa ($tweak,$inout4); # last tweak
1409 &jmp (&label("xts_enc_done"));
# Reached from xts_enc_short: the tweak for the trailing block is already
# in $tweak, so it is used as-is.
1411 &set_label("xts_enc_done6x",16); # $tweak is pre-calculated
1412 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1414 &jz (&label("xts_enc_ret"));
1415 &movdqa ($inout3,$tweak);
1416 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1417 &jmp (&label("xts_enc_steal"));
# Reached from the tail handlers: $tweak holds the last tweak that was
# used, so the next one is derived into $inout3.
1419 &set_label("xts_enc_done",16);
1420 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1421 &pxor ($twtmp,$twtmp);
1423 &jz (&label("xts_enc_ret"));
1425 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1426 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1427 &pshufd ($inout3,$twtmp,0x13);
1428 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1429 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
1430 &pxor ($inout3,$tweak);
# Byte-wise swap: each remaining input byte replaces a byte of the
# previously written block at -16($out), and the displaced output byte is
# written at the current $out position.  $rounds and $key serve as byte
# scratch here and are restored afterwards.
1432 &set_label("xts_enc_steal");
1433 &movz ($rounds,&BP(0,$inp));
1434 &movz ($key,&BP(-16,$out));
1435 &lea ($inp,&DWP(1,$inp));
1436 &mov (&BP(-16,$out),&LB($rounds));
1437 &mov (&BP(0,$out),&LB($key));
1438 &lea ($out,&DWP(1,$out));
1440 &jnz (&label("xts_enc_steal"));
# Process the patched block at -16($out) once more with the tweak held in
# $inout3.
1442 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1443 &mov ($key,$key_); # restore $key
1444 &mov ($rounds,$rounds_); # restore $rounds
1446 &movups ($inout0,&QWP(-16,$out)); # load input
1447 &xorps ($inout0,$inout3); # input^=tweak
1449 { &aesni_inline_generate1("enc"); }
1451 { &call ("_aesni_encrypt1"); }
1452 &xorps ($inout0,$inout3); # output^=tweak
1453 &movups (&QWP(-16,$out),$inout0); # write output
# Common exit: zero xmm0-xmm7 and the six tweak slots of the frame, then
# reload %esp from the frame.
1455 &set_label("xts_enc_ret");
1456 &pxor ("xmm0","xmm0"); # clear register bank
1457 &pxor ("xmm1","xmm1");
1458 &pxor ("xmm2","xmm2");
1459 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
1460 &pxor ("xmm3","xmm3");
1461 &movdqa (&QWP(16*1,"esp"),"xmm0");
1462 &pxor ("xmm4","xmm4");
1463 &movdqa (&QWP(16*2,"esp"),"xmm0");
1464 &pxor ("xmm5","xmm5");
1465 &movdqa (&QWP(16*3,"esp"),"xmm0");
1466 &pxor ("xmm6","xmm6");
1467 &movdqa (&QWP(16*4,"esp"),"xmm0");
1468 &pxor ("xmm7","xmm7");
1469 &movdqa (&QWP(16*5,"esp"),"xmm0");
1470 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1471 &function_end("aesni_xts_encrypt");
# aesni_xts_decrypt
#
# Arguments are fetched with wparam(): 0=inp, 1=out, 2=len, 3=key1,
# 4=key2, 5=clear-text tweak.  The tweak is run through the single-block
# *encryption* with key2 (same as on the encrypt side); the data is then
# processed with key1 using aesdec, six blocks per iteration in
# xts_dec_loop6, with separate tails for one to five blocks.  When a
# partial block trails the input, one more full block is decrypted with the
# later of two consecutive tweaks, bytes are swapped in xts_dec_steal, and
# the patched block is decrypted with the earlier tweak.
#
# Scratch frame (16-byte aligned):
#   16*0 .. 16*5   per-block tweaks
#   16*6           the mask constant (dwords 0x87, 0, 1, 0)
#   16*7+0         saved $len
#   16*7+4         value reloaded into %esp on exit
#
# Tweak update idiom used throughout: pcmpgtd against a zeroed register
# records the sign of every dword of the tweak, pshufd 0x13 plus pand with
# $twmask keep only the carry between the two 64-bit halves and the 0x87
# feedback, paddq shifts both halves left by one bit, and pxor folds the
# carry/feedback back in.
1473 &function_begin("aesni_xts_decrypt");
# Process the clear-text tweak with the key2 schedule; the result is
# picked up from $inout0 further down.
1474 &mov ($key,&wparam(4)); # key2
1475 &mov ($inp,&wparam(5)); # clear-text tweak
1477 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1478 &movups ($inout0,&QWP(0,$inp));
1480 { &aesni_inline_generate1("enc"); }
1482 { &call ("_aesni_encrypt1"); }
# Load the data arguments; from here on $key refers to key1.
1484 &mov ($inp,&wparam(0));
1485 &mov ($out,&wparam(1));
1486 &mov ($len,&wparam(2));
1487 &mov ($key,&wparam(3)); # key1
# Allocate and align the scratch frame.
1490 &sub ("esp",16*7+8);
1491 &and ("esp",-16); # align stack
# $rounds_ is borrowed as a scratch register for the length adjustment.
1493 &xor ($rounds_,$rounds_); # if(len%16) len-=16;
1495 &setnz (&LB($rounds_));
1497 &sub ($len,$rounds_);
1499 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1500 &mov (&DWP(16*6+4,"esp"),0);
1501 &mov (&DWP(16*6+8,"esp"),1);
1502 &mov (&DWP(16*6+12,"esp"),0);
1503 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1504 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1506 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1507 &mov ($key_,$key); # backup $key
1508 &mov ($rounds_,$rounds); # backup $rounds
# Seed the tweak chain from the processed tweak and load the mask.
1510 &movdqa ($tweak,$inout0);
1511 &pxor ($twtmp,$twtmp);
1512 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1513 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1517 &jc (&label("xts_dec_short"));
1521 &sub ($rounds_,$rounds);
1522 &lea ($key,&DWP(32,$key,$rounds));
1523 &jmp (&label("xts_dec_loop6"));
# Main loop: six input blocks are consumed and six output blocks are
# produced per iteration.
1525 &set_label("xts_dec_loop6",16);
# Park the current tweak in frame slot $i, then advance the tweak.
1526 for ($i=0;$i<4;$i++) {
1527 &pshufd ($twres,$twtmp,0x13);
1528 &pxor ($twtmp,$twtmp);
1529 &movdqa (&QWP(16*$i,"esp"),$tweak);
1530 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1531 &pand ($twres,$twmask); # isolate carry and residue
1532 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1533 &pxor ($tweak,$twres);
# The last tweak of the batch is assembled in $inout5 while the first
# round key and the first input block are being loaded.
1535 &pshufd ($inout5,$twtmp,0x13);
1536 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1537 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1538 &$movekey ($rndkey0,&QWP(0,$key_));
1539 &pand ($inout5,$twmask); # isolate carry and residue
1540 &movups ($inout0,&QWP(0,$inp)); # load input
1541 &pxor ($inout5,$tweak);
1543 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1544 &mov ($rounds,$rounds_);
1545 &movdqu ($inout1,&QWP(16*1,$inp));
1546 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1547 &movdqu ($inout2,&QWP(16*2,$inp));
1548 &pxor ($inout1,$rndkey0);
1549 &movdqu ($inout3,&QWP(16*3,$inp));
1550 &pxor ($inout2,$rndkey0);
1551 &movdqu ($inout4,&QWP(16*4,$inp));
1552 &pxor ($inout3,$rndkey0);
1553 &movdqu ($rndkey1,&QWP(16*5,$inp));
1554 &pxor ($inout4,$rndkey0);
1555 &lea ($inp,&DWP(16*6,$inp));
1556 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1557 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1558 &pxor ($inout5,$rndkey1);
# First aesdec round for all six lanes, interleaved with the remaining
# tweak xors, before entering the shared six-block routine.
1560 &$movekey ($rndkey1,&QWP(16,$key_));
1561 &pxor ($inout1,&QWP(16*1,"esp"));
1562 &pxor ($inout2,&QWP(16*2,"esp"));
1563 &aesdec ($inout0,$rndkey1);
1564 &pxor ($inout3,&QWP(16*3,"esp"));
1565 &pxor ($inout4,&QWP(16*4,"esp"));
1566 &aesdec ($inout1,$rndkey1);
1567 &pxor ($inout5,$rndkey0);
1568 &$movekey ($rndkey0,&QWP(32,$key_));
1569 &aesdec ($inout2,$rndkey1);
1570 &aesdec ($inout3,$rndkey1);
1571 &aesdec ($inout4,$rndkey1);
1572 &aesdec ($inout5,$rndkey1);
1573 &call (&label("_aesni_decrypt6_enter"));
# Xor the saved tweaks into the six results and store them, while the
# tweak for the next block is derived from the last one of this batch.
1575 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1576 &pxor ($twtmp,$twtmp);
1577 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1578 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1579 &xorps ($inout1,&QWP(16*1,"esp"));
1580 &movups (&QWP(16*0,$out),$inout0); # write output
1581 &xorps ($inout2,&QWP(16*2,"esp"));
1582 &movups (&QWP(16*1,$out),$inout1);
1583 &xorps ($inout3,&QWP(16*3,"esp"));
1584 &movups (&QWP(16*2,$out),$inout2);
1585 &xorps ($inout4,&QWP(16*4,"esp"));
1586 &movups (&QWP(16*3,$out),$inout3);
1587 &xorps ($inout5,$tweak);
1588 &movups (&QWP(16*4,$out),$inout4);
1589 &pshufd ($twres,$twtmp,0x13);
1590 &movups (&QWP(16*5,$out),$inout5);
1591 &lea ($out,&DWP(16*6,$out));
1592 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1594 &pxor ($twtmp,$twtmp);
1595 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1596 &pand ($twres,$twmask); # isolate carry and residue
1597 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1598 &pxor ($tweak,$twres);
1601 &jnc (&label("xts_dec_loop6"));
1603 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
1604 &mov ($key,$key_); # restore $key
1605 &mov ($rounds_,$rounds);
# Tail dispatch: the branches below select the one/two/three/four block
# handlers (or xts_dec_done6x); the fall-through path processes five
# blocks.  Tweaks are generated incrementally between the branches and
# put aside in $inout3, $inout4 and $inout5.
1607 &set_label("xts_dec_short");
1609 &jz (&label("xts_dec_done6x"));
1611 &movdqa ($inout3,$tweak); # put aside previous tweak
1613 &jb (&label("xts_dec_one"));
1615 &pshufd ($twres,$twtmp,0x13);
1616 &pxor ($twtmp,$twtmp);
1617 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1618 &pand ($twres,$twmask); # isolate carry and residue
1619 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1620 &pxor ($tweak,$twres);
1621 &je (&label("xts_dec_two"));
1623 &pshufd ($twres,$twtmp,0x13);
1624 &pxor ($twtmp,$twtmp);
1625 &movdqa ($inout4,$tweak); # put aside previous tweak
1626 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1627 &pand ($twres,$twmask); # isolate carry and residue
1628 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1629 &pxor ($tweak,$twres);
1631 &jb (&label("xts_dec_three"));
1633 &pshufd ($twres,$twtmp,0x13);
1634 &pxor ($twtmp,$twtmp);
1635 &movdqa ($inout5,$tweak); # put aside previous tweak
1636 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1637 &pand ($twres,$twmask); # isolate carry and residue
1638 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1639 &pxor ($tweak,$twres);
1640 &movdqa (&QWP(16*0,"esp"),$inout3);
1641 &movdqa (&QWP(16*1,"esp"),$inout4);
1642 &je (&label("xts_dec_four"));
# Five-block tail: tweaks 0..3 live in frame slots 0..3, the fifth is
# built in $inout5 and saved to slot 4; only five results are written.
1644 &movdqa (&QWP(16*2,"esp"),$inout5);
1645 &pshufd ($inout5,$twtmp,0x13);
1646 &movdqa (&QWP(16*3,"esp"),$tweak);
1647 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1648 &pand ($inout5,$twmask); # isolate carry and residue
1649 &pxor ($inout5,$tweak);
1651 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1652 &movdqu ($inout1,&QWP(16*1,$inp));
1653 &movdqu ($inout2,&QWP(16*2,$inp));
1654 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1655 &movdqu ($inout3,&QWP(16*3,$inp));
1656 &pxor ($inout1,&QWP(16*1,"esp"));
1657 &movdqu ($inout4,&QWP(16*4,$inp));
1658 &pxor ($inout2,&QWP(16*2,"esp"));
1659 &lea ($inp,&DWP(16*5,$inp));
1660 &pxor ($inout3,&QWP(16*3,"esp"));
1661 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1662 &pxor ($inout4,$inout5);
1664 &call ("_aesni_decrypt6");
1666 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1667 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1668 &xorps ($inout1,&QWP(16*1,"esp"));
1669 &xorps ($inout2,&QWP(16*2,"esp"));
1670 &movups (&QWP(16*0,$out),$inout0); # write output
1671 &xorps ($inout3,&QWP(16*3,"esp"));
1672 &movups (&QWP(16*1,$out),$inout1);
1673 &xorps ($inout4,$tweak);
1674 &movups (&QWP(16*2,$out),$inout2);
1675 &movups (&QWP(16*3,$out),$inout3);
1676 &movups (&QWP(16*4,$out),$inout4);
1677 &lea ($out,&DWP(16*5,$out));
1678 &jmp (&label("xts_dec_done"));
# One-block tail: the tweak is the one put aside in $inout3.
1680 &set_label("xts_dec_one",16);
1681 &movups ($inout0,&QWP(16*0,$inp)); # load input
1682 &lea ($inp,&DWP(16*1,$inp));
1683 &xorps ($inout0,$inout3); # input^=tweak
1685 { &aesni_inline_generate1("dec"); }
1687 { &call ("_aesni_decrypt1"); }
1688 &xorps ($inout0,$inout3); # output^=tweak
1689 &movups (&QWP(16*0,$out),$inout0); # write output
1690 &lea ($out,&DWP(16*1,$out));
1692 &movdqa ($tweak,$inout3); # last tweak
1693 &jmp (&label("xts_dec_done"));
# Two-block tail: tweaks are $inout3 and the current $tweak (copied to
# $inout4).
1695 &set_label("xts_dec_two",16);
1696 &movaps ($inout4,$tweak); # put aside last tweak
1698 &movups ($inout0,&QWP(16*0,$inp)); # load input
1699 &movups ($inout1,&QWP(16*1,$inp));
1700 &lea ($inp,&DWP(16*2,$inp));
1701 &xorps ($inout0,$inout3); # input^=tweak
1702 &xorps ($inout1,$inout4);
1704 &call ("_aesni_decrypt2");
1706 &xorps ($inout0,$inout3); # output^=tweak
1707 &xorps ($inout1,$inout4);
1708 &movups (&QWP(16*0,$out),$inout0); # write output
1709 &movups (&QWP(16*1,$out),$inout1);
1710 &lea ($out,&DWP(16*2,$out));
1712 &movdqa ($tweak,$inout4); # last tweak
1713 &jmp (&label("xts_dec_done"));
# Three-block tail: tweaks are $inout3, $inout4 and the current $tweak
# (copied to $inout5).
1715 &set_label("xts_dec_three",16);
1716 &movaps ($inout5,$tweak); # put aside last tweak
1717 &movups ($inout0,&QWP(16*0,$inp)); # load input
1718 &movups ($inout1,&QWP(16*1,$inp));
1719 &movups ($inout2,&QWP(16*2,$inp));
1720 &lea ($inp,&DWP(16*3,$inp));
1721 &xorps ($inout0,$inout3); # input^=tweak
1722 &xorps ($inout1,$inout4);
1723 &xorps ($inout2,$inout5);
1725 &call ("_aesni_decrypt3");
1727 &xorps ($inout0,$inout3); # output^=tweak
1728 &xorps ($inout1,$inout4);
1729 &xorps ($inout2,$inout5);
1730 &movups (&QWP(16*0,$out),$inout0); # write output
1731 &movups (&QWP(16*1,$out),$inout1);
1732 &movups (&QWP(16*2,$out),$inout2);
1733 &lea ($out,&DWP(16*3,$out));
1735 &movdqa ($tweak,$inout5); # last tweak
1736 &jmp (&label("xts_dec_done"));
# Four-block tail: the first two tweaks were stored to frame slots 0 and 1
# before the branch, the third is in $inout5 and the fourth is the current
# $tweak (copied to $inout4).
1738 &set_label("xts_dec_four",16);
1739 &movaps ($inout4,$tweak); # put aside last tweak
1741 &movups ($inout0,&QWP(16*0,$inp)); # load input
1742 &movups ($inout1,&QWP(16*1,$inp));
1743 &movups ($inout2,&QWP(16*2,$inp));
1744 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1745 &movups ($inout3,&QWP(16*3,$inp));
1746 &lea ($inp,&DWP(16*4,$inp));
1747 &xorps ($inout1,&QWP(16*1,"esp"));
1748 &xorps ($inout2,$inout5);
1749 &xorps ($inout3,$inout4);
1751 &call ("_aesni_decrypt4");
1753 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1754 &xorps ($inout1,&QWP(16*1,"esp"));
1755 &xorps ($inout2,$inout5);
1756 &movups (&QWP(16*0,$out),$inout0); # write output
1757 &xorps ($inout3,$inout4);
1758 &movups (&QWP(16*1,$out),$inout1);
1759 &movups (&QWP(16*2,$out),$inout2);
1760 &movups (&QWP(16*3,$out),$inout3);
1761 &lea ($out,&DWP(16*4,$out));
1763 &movdqa ($tweak,$inout4); # last tweak
1764 &jmp (&label("xts_dec_done"));
# Reached from xts_dec_short: $tweak already holds the next tweak, so only
# the one after it still has to be derived (at xts_dec_only_one_more).
1766 &set_label("xts_dec_done6x",16); # $tweak is pre-calculated
1767 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1769 &jz (&label("xts_dec_ret"));
1770 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1771 &jmp (&label("xts_dec_only_one_more"));
# Reached from the tail handlers: $tweak holds the last tweak that was
# used, so it is advanced once here before falling through.
1773 &set_label("xts_dec_done",16);
1774 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1775 &pxor ($twtmp,$twtmp);
1777 &jz (&label("xts_dec_ret"));
1779 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1780 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1781 &pshufd ($twres,$twtmp,0x13);
1782 &pxor ($twtmp,$twtmp);
1783 &movdqa ($twmask,&QWP(16*6,"esp"));
1784 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1785 &pand ($twres,$twmask); # isolate carry and residue
1786 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1787 &pxor ($tweak,$twres);
# Keep the current tweak in $inout4 and derive the following one into
# $inout3; the next full block is decrypted with $inout3 first.
1789 &set_label("xts_dec_only_one_more");
1790 &pshufd ($inout3,$twtmp,0x13);
1791 &movdqa ($inout4,$tweak); # put aside previous tweak
1792 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1793 &pand ($inout3,$twmask); # isolate carry and residue
1794 &pxor ($inout3,$tweak);
1796 &mov ($key,$key_); # restore $key
1797 &mov ($rounds,$rounds_); # restore $rounds
1799 &movups ($inout0,&QWP(0,$inp)); # load input
1800 &xorps ($inout0,$inout3); # input^=tweak
1802 { &aesni_inline_generate1("dec"); }
1804 { &call ("_aesni_decrypt1"); }
1805 &xorps ($inout0,$inout3); # output^=tweak
1806 &movups (&QWP(0,$out),$inout0); # write output
# Byte-wise swap: each input byte at 16($inp) replaces a byte of the block
# just written at 0($out), and the displaced byte is written at 16($out).
# $rounds and $key serve as byte scratch here and are restored afterwards.
1808 &set_label("xts_dec_steal");
1809 &movz ($rounds,&BP(16,$inp));
1810 &movz ($key,&BP(0,$out));
1811 &lea ($inp,&DWP(1,$inp));
1812 &mov (&BP(0,$out),&LB($rounds));
1813 &mov (&BP(16,$out),&LB($key));
1814 &lea ($out,&DWP(1,$out));
1816 &jnz (&label("xts_dec_steal"));
# Decrypt the patched block at 0($out) with the earlier tweak kept in
# $inout4.
1818 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1819 &mov ($key,$key_); # restore $key
1820 &mov ($rounds,$rounds_); # restore $rounds
1822 &movups ($inout0,&QWP(0,$out)); # load input
1823 &xorps ($inout0,$inout4); # input^=tweak
1825 { &aesni_inline_generate1("dec"); }
1827 { &call ("_aesni_decrypt1"); }
1828 &xorps ($inout0,$inout4); # output^=tweak
1829 &movups (&QWP(0,$out),$inout0); # write output
# Common exit: zero xmm0-xmm7 and the six tweak slots of the frame, then
# reload %esp from the frame.
1831 &set_label("xts_dec_ret");
1832 &pxor ("xmm0","xmm0"); # clear register bank
1833 &pxor ("xmm1","xmm1");
1834 &pxor ("xmm2","xmm2");
1835 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
1836 &pxor ("xmm3","xmm3");
1837 &movdqa (&QWP(16*1,"esp"),"xmm0");
1838 &pxor ("xmm4","xmm4");
1839 &movdqa (&QWP(16*2,"esp"),"xmm0");
1840 &pxor ("xmm5","xmm5");
1841 &movdqa (&QWP(16*3,"esp"),"xmm0");
1842 &pxor ("xmm6","xmm6");
1843 &movdqa (&QWP(16*4,"esp"),"xmm0");
1844 &pxor ("xmm7","xmm7");
1845 &movdqa (&QWP(16*5,"esp"),"xmm0");
1846 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1847 &function_end("aesni_xts_decrypt");
1850 ######################################################################
1851 # void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
1852 # const AES_KEY *key, unsigned int start_block_num,
1853 # unsigned char offset_i[16], const unsigned char L_[][16],
1854 # unsigned char checksum[16]);
1857 # offsets within stack frame
# The checksum slot sits at 16*6; the five 4-byte slots that follow start
# at 16*7 and are, in order: key, rounds, out, end and esp.
1858 my $checksum = 16*6;
1859 my ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4));
1861 # reassigned registers
# Aliases: $l_ is $rounds_, $block is $key_, $i1 is $rounds, $i3 is $len
# and $i5 is $out.
1862 my ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out);
1863 # $l_, $block, $inp, $key are permanently allocated in registers;
1864 # the remaining non-volatile ones are offloaded to the stack, where
1865 # they stay invariant once written.
1867 &function_begin("aesni_ocb_encrypt");
1868 &mov ($rounds,&wparam(5)); # &offset_i
1869 &mov ($rounds_,&wparam(7)); # &checksum
1871 &mov ($inp,&wparam(0));
1872 &mov ($out,&wparam(1));
1873 &mov ($len,&wparam(2));
1874 &mov ($key,&wparam(3));
1875 &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i
1876 &mov ($block,&wparam(4)); # start_block_num
1877 &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum
1878 &mov ($l_,&wparam(6)); # L_
1880 &mov ($rounds,"esp");
1881 &sub ("esp",$esp_off+4); # alloca
1882 &and ("esp",-16); # align stack
1886 &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6
1887 &mov (&DWP($out_off,"esp"),$out);
1888 &mov (&DWP($end_off,"esp"),$len);
1889 &mov (&DWP($esp_off,"esp"),$rounds);
1891 &mov ($rounds,&DWP(240,$key));
1894 &jnz (&label("odd"));
1899 &movdqu ($inout5,&QWP(0,$l_,$i3));
1900 &mov ($i3,$key); # put aside key
1902 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1903 &lea ($inp,&DWP(16,$inp));
1905 &pxor ($inout5,$rndkey0); # ^ last offset_i
1906 &pxor ($rndkey1,$inout0); # checksum
1907 &pxor ($inout0,$inout5); # ^ offset_i
1909 &movdqa ($inout4,$rndkey1);
1911 { &aesni_inline_generate1("enc"); }
1913 { &call ("_aesni_encrypt1"); }
1915 &xorps ($inout0,$inout5); # ^ offset_i
1916 &movdqa ($rndkey0,$inout5); # pass last offset_i
1917 &movdqa ($rndkey1,$inout4); # pass the checksum
1919 &movups (&QWP(-16,$out,$inp),$inout0); # store output
1921 &mov ($rounds,&DWP(240,$i3));
1922 &mov ($key,$i3); # restore key
1923 &mov ($len,&DWP($end_off,"esp"));
1928 &sub ($out,$rounds); # twisted rounds
1929 &mov (&DWP($key_off,"esp"),$key);
1930 &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule
1931 &mov (&DWP($rounds_off,"esp"),$out);
1934 &ja (&label("short"));
1935 &jmp (&label("grandloop"));
1937 &set_label("grandloop",32);
1938 &lea ($i1,&DWP(1,$block));
1939 &lea ($i3,&DWP(3,$block));
1940 &lea ($i5,&DWP(5,$block));
1948 &movdqu ($inout0,&QWP(0,$l_));
1949 &movdqu ($inout1,&QWP(0,$l_,$i1));
1950 &mov ($rounds,&DWP($rounds_off,"esp"));
1951 &movdqa ($inout2,$inout0);
1952 &movdqu ($inout3,&QWP(0,$l_,$i3));
1953 &movdqa ($inout4,$inout0);
1954 &movdqu ($inout5,&QWP(0,$l_,$i5));
1956 &pxor ($inout0,$rndkey0); # ^ last offset_i
1957 &pxor ($inout1,$inout0);
1958 &movdqa (&QWP(16*0,"esp"),$inout0);
1959 &pxor ($inout2,$inout1);
1960 &movdqa (&QWP(16*1,"esp"),$inout1);
1961 &pxor ($inout3,$inout2);
1962 &movdqa (&QWP(16*2,"esp"),$inout2);
1963 &pxor ($inout4,$inout3);
1964 &movdqa (&QWP(16*3,"esp"),$inout3);
1965 &pxor ($inout5,$inout4);
1966 &movdqa (&QWP(16*4,"esp"),$inout4);
1967 &movdqa (&QWP(16*5,"esp"),$inout5);
1969 &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
1970 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1971 &movdqu ($inout1,&QWP(16*1,$inp));
1972 &movdqu ($inout2,&QWP(16*2,$inp));
1973 &movdqu ($inout3,&QWP(16*3,$inp));
1974 &movdqu ($inout4,&QWP(16*4,$inp));
1975 &movdqu ($inout5,&QWP(16*5,$inp));
1976 &lea ($inp,&DWP(16*6,$inp));
1978 &pxor ($rndkey1,$inout0); # checksum
1979 &pxor ($inout0,$rndkey0); # ^ roundkey[0]
1980 &pxor ($rndkey1,$inout1);
1981 &pxor ($inout1,$rndkey0);
1982 &pxor ($rndkey1,$inout2);
1983 &pxor ($inout2,$rndkey0);
1984 &pxor ($rndkey1,$inout3);
1985 &pxor ($inout3,$rndkey0);
1986 &pxor ($rndkey1,$inout4);
1987 &pxor ($inout4,$rndkey0);
1988 &pxor ($rndkey1,$inout5);
1989 &pxor ($inout5,$rndkey0);
1990 &movdqa (&QWP($checksum,"esp"),$rndkey1);
1992 &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
1993 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
1994 &pxor ($inout1,&QWP(16*1,"esp"));
1995 &pxor ($inout2,&QWP(16*2,"esp"));
1996 &pxor ($inout3,&QWP(16*3,"esp"));
1997 &pxor ($inout4,&QWP(16*4,"esp"));
1998 &pxor ($inout5,&QWP(16*5,"esp"));
2000 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
2001 &aesenc ($inout0,$rndkey1);
2002 &aesenc ($inout1,$rndkey1);
2003 &aesenc ($inout2,$rndkey1);
2004 &aesenc ($inout3,$rndkey1);
2005 &aesenc ($inout4,$rndkey1);
2006 &aesenc ($inout5,$rndkey1);
2008 &mov ($out,&DWP($out_off,"esp"));
2009 &mov ($len,&DWP($end_off,"esp"));
2010 &call ("_aesni_encrypt6_enter");
2012 &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i
2013 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2014 &pxor ($inout1,&QWP(16*1,"esp"));
2015 &pxor ($inout2,&QWP(16*2,"esp"));
2016 &pxor ($inout3,&QWP(16*3,"esp"));
2017 &pxor ($inout4,&QWP(16*4,"esp"));
2018 &pxor ($inout5,$rndkey0);
2019 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2021 &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output
2022 &movdqu (&QWP(-16*5,$out,$inp),$inout1);
2023 &movdqu (&QWP(-16*4,$out,$inp),$inout2);
2024 &movdqu (&QWP(-16*3,$out,$inp),$inout3);
2025 &movdqu (&QWP(-16*2,$out,$inp),$inout4);
2026 &movdqu (&QWP(-16*1,$out,$inp),$inout5);
2027 &cmp ($inp,$len); # done yet?
2028 &jb (&label("grandloop"));
2030 &set_label("short");
2033 &jz (&label("done"));
2036 &jb (&label("one"));
2037 &je (&label("two"));
2040 &jb (&label("three"));
2041 &je (&label("four"));
2043 &lea ($i1,&DWP(1,$block));
2044 &lea ($i3,&DWP(3,$block));
2049 &movdqu ($inout0,&QWP(0,$l_));
2050 &movdqu ($inout1,&QWP(0,$l_,$i1));
2051 &mov ($rounds,&DWP($rounds_off,"esp"));
2052 &movdqa ($inout2,$inout0);
2053 &movdqu ($inout3,&QWP(0,$l_,$i3));
2054 &movdqa ($inout4,$inout0);
2056 &pxor ($inout0,$rndkey0); # ^ last offset_i
2057 &pxor ($inout1,$inout0);
2058 &movdqa (&QWP(16*0,"esp"),$inout0);
2059 &pxor ($inout2,$inout1);
2060 &movdqa (&QWP(16*1,"esp"),$inout1);
2061 &pxor ($inout3,$inout2);
2062 &movdqa (&QWP(16*2,"esp"),$inout2);
2063 &pxor ($inout4,$inout3);
2064 &movdqa (&QWP(16*3,"esp"),$inout3);
2065 &pxor ($inout5,$inout4);
2066 &movdqa (&QWP(16*4,"esp"),$inout4);
2068 &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
2069 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2070 &movdqu ($inout1,&QWP(16*1,$inp));
2071 &movdqu ($inout2,&QWP(16*2,$inp));
2072 &movdqu ($inout3,&QWP(16*3,$inp));
2073 &movdqu ($inout4,&QWP(16*4,$inp));
2074 &pxor ($inout5,$inout5);
2076 &pxor ($rndkey1,$inout0); # checksum
2077 &pxor ($inout0,$rndkey0); # ^ roundkey[0]
2078 &pxor ($rndkey1,$inout1);
2079 &pxor ($inout1,$rndkey0);
2080 &pxor ($rndkey1,$inout2);
2081 &pxor ($inout2,$rndkey0);
2082 &pxor ($rndkey1,$inout3);
2083 &pxor ($inout3,$rndkey0);
2084 &pxor ($rndkey1,$inout4);
2085 &pxor ($inout4,$rndkey0);
2086 &movdqa (&QWP($checksum,"esp"),$rndkey1);
2088 &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
2089 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2090 &pxor ($inout1,&QWP(16*1,"esp"));
2091 &pxor ($inout2,&QWP(16*2,"esp"));
2092 &pxor ($inout3,&QWP(16*3,"esp"));
2093 &pxor ($inout4,&QWP(16*4,"esp"));
2095 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
2096 &aesenc ($inout0,$rndkey1);
2097 &aesenc ($inout1,$rndkey1);
2098 &aesenc ($inout2,$rndkey1);
2099 &aesenc ($inout3,$rndkey1);
2100 &aesenc ($inout4,$rndkey1);
2101 &aesenc ($inout5,$rndkey1);
2103 &mov ($out,&DWP($out_off,"esp"));
2104 &call ("_aesni_encrypt6_enter");
2106 &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i
2107 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2108 &pxor ($inout1,&QWP(16*1,"esp"));
2109 &pxor ($inout2,&QWP(16*2,"esp"));
2110 &pxor ($inout3,&QWP(16*3,"esp"));
2111 &pxor ($inout4,$rndkey0);
2112 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2114 &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output
2115 &movdqu (&QWP(16*1,$out,$inp),$inout1);
2116 &movdqu (&QWP(16*2,$out,$inp),$inout2);
2117 &movdqu (&QWP(16*3,$out,$inp),$inout3);
2118 &movdqu (&QWP(16*4,$out,$inp),$inout4);
2120 &jmp (&label("done"));
2122 &set_label("one",16);
2123 &movdqu ($inout5,&QWP(0,$l_));
2124 &mov ($key,&DWP($key_off,"esp")); # restore key
2126 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2127 &mov ($rounds,&DWP(240,$key));
2129 &pxor ($inout5,$rndkey0); # ^ last offset_i
2130 &pxor ($rndkey1,$inout0); # checksum
2131 &pxor ($inout0,$inout5); # ^ offset_i
2133 &movdqa ($inout4,$rndkey1);
2134 &mov ($out,&DWP($out_off,"esp"));
2136 { &aesni_inline_generate1("enc"); }
2138 { &call ("_aesni_encrypt1"); }
2140 &xorps ($inout0,$inout5); # ^ offset_i
2141 &movdqa ($rndkey0,$inout5); # pass last offset_i
2142 &movdqa ($rndkey1,$inout4); # pass the checksum
2143 &movups (&QWP(0,$out,$inp),$inout0);
2145 &jmp (&label("done"));
2147 &set_label("two",16); # tail path: exactly two blocks are loaded, encrypted and stored here
2148 &lea ($i1,&DWP(1,$block)); # $i1 = $block + 1
2149 &mov ($key,&DWP($key_off,"esp")); # restore key
2152 &movdqu ($inout4,&QWP(0,$l_)); # L_ entry at offset 0
2153 &movdqu ($inout5,&QWP(0,$l_,$i1)); # L_ entry selected by $i1
2155 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2156 &movdqu ($inout1,&QWP(16*1,$inp));
2157 &mov ($rounds,&DWP(240,$key)); # dword at key+240 (presumably the round count - confirm against the key layout)
2159 &pxor ($inout4,$rndkey0); # ^ last offset_i
2160 &pxor ($inout5,$inout4); # second offset chains from the first
2162 &pxor ($rndkey1,$inout0); # checksum
2163 &pxor ($inout0,$inout4); # ^ offset_i
2164 &pxor ($rndkey1,$inout1);
2165 &pxor ($inout1,$inout5);
2167 &movdqa ($inout3,$rndkey1); # park checksum in $inout3 across the call; the ';' was missing here, which made Perl treat the next '&mov' as a bitwise-AND operand
2168 &mov ($out,&DWP($out_off,"esp")); # reload output pointer saved in the frame
2169 &call ("_aesni_encrypt2");
2171 &xorps ($inout0,$inout4); # ^ offset_i
2172 &xorps ($inout1,$inout5);
2173 &movdqa ($rndkey0,$inout5); # pass last offset_i
2174 &movdqa ($rndkey1,$inout3); # pass the checksum
2175 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2176 &movups (&QWP(16*1,$out,$inp),$inout1);
2178 &jmp (&label("done"));
2180 &set_label("three",16);
2181 &lea ($i1,&DWP(1,$block));
2182 &mov ($key,&DWP($key_off,"esp")); # restore key
2185 &movdqu ($inout3,&QWP(0,$l_));
2186 &movdqu ($inout4,&QWP(0,$l_,$i1));
2187 &movdqa ($inout5,$inout3);
2189 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2190 &movdqu ($inout1,&QWP(16*1,$inp));
2191 &movdqu ($inout2,&QWP(16*2,$inp));
2192 &mov ($rounds,&DWP(240,$key));
2194 &pxor ($inout3,$rndkey0); # ^ last offset_i
2195 &pxor ($inout4,$inout3);
2196 &pxor ($inout5,$inout4);
2198 &pxor ($rndkey1,$inout0); # checksum
2199 &pxor ($inout0,$inout3); # ^ offset_i
2200 &pxor ($rndkey1,$inout1);
2201 &pxor ($inout1,$inout4);
2202 &pxor ($rndkey1,$inout2);
2203 &pxor ($inout2,$inout5);
2205 &movdqa (&QWP($checksum,"esp"),$rndkey1);
2206 &mov ($out,&DWP($out_off,"esp"));
2207 &call ("_aesni_encrypt3");
2209 &xorps ($inout0,$inout3); # ^ offset_i
2210 &xorps ($inout1,$inout4);
2211 &xorps ($inout2,$inout5);
2212 &movdqa ($rndkey0,$inout5); # pass last offset_i
2213 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2214 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2215 &movups (&QWP(16*1,$out,$inp),$inout1);
2216 &movups (&QWP(16*2,$out,$inp),$inout2);
2218 &jmp (&label("done"));
2220 &set_label("four",16); # tail path: exactly four blocks are loaded, encrypted and stored here
2221 &lea ($i1,&DWP(1,$block)); # $i1 = $block + 1
2222 &lea ($i3,&DWP(3,$block)); # $i3 = $block + 3
2225 &mov ($key,&DWP($key_off,"esp")); # restore key
2228 &movdqu ($inout2,&QWP(0,$l_)); # L_ entry at offset 0
2229 &movdqu ($inout3,&QWP(0,$l_,$i1)); # L_ entry selected by $i1
2230 &movdqa ($inout4,$inout2); # offset-0 entry is reused for the third block
2231 &movdqu ($inout5,&QWP(0,$l_,$i3)); # L_ entry selected by $i3
2233 &pxor ($inout2,$rndkey0); # ^ last offset_i
2234 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2235 &pxor ($inout3,$inout2); # each offset chains from the previous one
2236 &movdqu ($inout1,&QWP(16*1,$inp));
2237 &pxor ($inout4,$inout3);
2238 &movdqa (&QWP(16*0,"esp"),$inout2); # spill first offset: $inout2 is needed for input
2239 &pxor ($inout5,$inout4);
2240 &movdqa (&QWP(16*1,"esp"),$inout3); # spill second offset: $inout3 is needed for input
2241 &movdqu ($inout2,&QWP(16*2,$inp));
2242 &movdqu ($inout3,&QWP(16*3,$inp));
2243 &mov ($rounds,&DWP(240,$key)); # dword at key+240 (presumably the round count - confirm against the key layout)
2245 &pxor ($rndkey1,$inout0); # checksum
2246 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2247 &pxor ($rndkey1,$inout1);
2248 &pxor ($inout1,&QWP(16*1,"esp"));
2249 &pxor ($rndkey1,$inout2);
2250 &pxor ($inout2,$inout4);
2251 &pxor ($rndkey1,$inout3);
2252 &pxor ($inout3,$inout5);
2254 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum across the call; the ';' was missing here, which made Perl treat the next '&mov' as a bitwise-AND operand
2255 &mov ($out,&DWP($out_off,"esp")); # reload output pointer saved in the frame
2256 &call ("_aesni_encrypt4");
2258 &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2259 &xorps ($inout1,&QWP(16*1,"esp"));
2260 &xorps ($inout2,$inout4);
2261 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2262 &xorps ($inout3,$inout5);
2263 &movups (&QWP(16*1,$out,$inp),$inout1);
2264 &movdqa ($rndkey0,$inout5); # pass last offset_i
2265 &movups (&QWP(16*2,$out,$inp),$inout2);
2266 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2267 &movups (&QWP(16*3,$out,$inp),$inout3);
2270 &mov ($key,&DWP($esp_off,"esp"));
2271 &pxor ($inout0,$inout0); # clear register bank
2272 &pxor ($inout1,$inout1);
2273 &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack
2274 &pxor ($inout2,$inout2);
2275 &movdqa (&QWP(16*1,"esp"),$inout0);
2276 &pxor ($inout3,$inout3);
2277 &movdqa (&QWP(16*2,"esp"),$inout0);
2278 &pxor ($inout4,$inout4);
2279 &movdqa (&QWP(16*3,"esp"),$inout0);
2280 &pxor ($inout5,$inout5);
2281 &movdqa (&QWP(16*4,"esp"),$inout0);
2282 &movdqa (&QWP(16*5,"esp"),$inout0);
2283 &movdqa (&QWP(16*6,"esp"),$inout0);
2285 &lea ("esp",&DWP(0,$key));
2286 &mov ($rounds,&wparam(5)); # &offset_i
2287 &mov ($rounds_,&wparam(7)); # &checksum
2288 &movdqu (&QWP(0,$rounds),$rndkey0);
2289 &pxor ($rndkey0,$rndkey0);
2290 &movdqu (&QWP(0,$rounds_),$rndkey1);
2291 &pxor ($rndkey1,$rndkey1);
2292 &function_end("aesni_ocb_encrypt");
2294 &function_begin("aesni_ocb_decrypt"); # OCB decrypt; stack args: 0=inp 1=out 2=len 3=key 4=start_block_num 5=&offset_i 6=L_ 7=&checksum
2295 &mov ($rounds,&wparam(5)); # &offset_i
2296 &mov ($rounds_,&wparam(7)); # &checksum
2298 &mov ($inp,&wparam(0)); # input pointer
2299 &mov ($out,&wparam(1)); # output pointer
2300 &mov ($len,&wparam(2));
2301 &mov ($key,&wparam(3)); # key schedule pointer
2302 &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i
2303 &mov ($block,&wparam(4)); # start_block_num
2304 &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum
2305 &mov ($l_,&wparam(6)); # L_
2307 &mov ($rounds,"esp"); # remember caller's %esp (stored at $esp_off below)
2308 &sub ("esp",$esp_off+4); # alloca
2309 &and ("esp",-16); # align stack
2313 &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6
2314 &mov (&DWP($out_off,"esp"),$out); # frame: output pointer
2315 &mov (&DWP($end_off,"esp"),$len); # frame: loop end pointer
2316 &mov (&DWP($esp_off,"esp"),$rounds); # frame: caller's %esp
2318 &mov ($rounds,&DWP(240,$key)); # dword at key+240 (presumably the round count - confirm against the key layout)
2321 &jnz (&label("odd"));
2326 &movdqu ($inout5,&QWP(0,$l_,$i3)); # L_ entry selected by $i3
2327 &mov ($i3,$key); # put aside key
2329 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2330 &lea ($inp,&DWP(16,$inp)); # advance input by one 16-byte block
2332 &pxor ($inout5,$rndkey0); # ^ last offset_i
2333 &pxor ($inout0,$inout5); # ^ offset_i
2335 &movdqa ($inout4,$rndkey1); # park checksum in $inout4 across the single-block decrypt
2337 { &aesni_inline_generate1("dec"); }
2339 { &call ("_aesni_decrypt1"); }
2341 &xorps ($inout0,$inout5); # ^ offset_i
2342 &movaps ($rndkey1,$inout4); # pass the checksum
2343 &movdqa ($rndkey0,$inout5); # pass last offset_i
2344 &xorps ($rndkey1,$inout0); # checksum
2345 &movups (&QWP(-16,$out,$inp),$inout0); # store output
2347 &mov ($rounds,&DWP(240,$i3)); # reload the dword at key+240 via the put-aside key pointer
2348 &mov ($key,$i3); # restore key
2349 &mov ($len,&DWP($end_off,"esp")); # reload loop end pointer
2354 &sub ($out,$rounds); # twisted rounds
2355 &mov (&DWP($key_off,"esp"),$key); # frame: key pointer (tail paths reload it)
2356 &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule
2357 &mov (&DWP($rounds_off,"esp"),$out); # frame: twisted rounds value (reloaded into $rounds in the loops)
2360 &ja (&label("short"));
2361 &jmp (&label("grandloop"));
2363 &set_label("grandloop",32); # main loop: six 16-byte blocks per iteration
2364 &lea ($i1,&DWP(1,$block)); # $i1 = $block + 1
2365 &lea ($i3,&DWP(3,$block)); # $i3 = $block + 3
2366 &lea ($i5,&DWP(5,$block)); # $i5 = $block + 5
2374 &movdqu ($inout0,&QWP(0,$l_)); # L_ entry at offset 0
2375 &movdqu ($inout1,&QWP(0,$l_,$i1)); # L_ entry selected by $i1
2376 &mov ($rounds,&DWP($rounds_off,"esp")); # reload twisted rounds value
2377 &movdqa ($inout2,$inout0); # offset-0 entry reused for the third block
2378 &movdqu ($inout3,&QWP(0,$l_,$i3)); # L_ entry selected by $i3
2379 &movdqa ($inout4,$inout0); # offset-0 entry reused for the fifth block
2380 &movdqu ($inout5,&QWP(0,$l_,$i5)); # L_ entry selected by $i5
2382 &pxor ($inout0,$rndkey0); # ^ last offset_i
2383 &pxor ($inout1,$inout0); # each offset_i = previous offset ^ its L_ entry
2384 &movdqa (&QWP(16*0,"esp"),$inout0); # spill the six offsets to stack slots 0..5
2385 &pxor ($inout2,$inout1);
2386 &movdqa (&QWP(16*1,"esp"),$inout1);
2387 &pxor ($inout3,$inout2);
2388 &movdqa (&QWP(16*2,"esp"),$inout2);
2389 &pxor ($inout4,$inout3);
2390 &movdqa (&QWP(16*3,"esp"),$inout3);
2391 &pxor ($inout5,$inout4);
2392 &movdqa (&QWP(16*4,"esp"),$inout4);
2393 &movdqa (&QWP(16*5,"esp"),$inout5);
2395 &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); # roundkey[0]
2396 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2397 &movdqu ($inout1,&QWP(16*1,$inp));
2398 &movdqu ($inout2,&QWP(16*2,$inp));
2399 &movdqu ($inout3,&QWP(16*3,$inp));
2400 &movdqu ($inout4,&QWP(16*4,$inp));
2401 &movdqu ($inout5,&QWP(16*5,$inp));
2402 &lea ($inp,&DWP(16*6,$inp)); # advance input by six blocks
2404 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum: $rndkey1 is reused for round keys
2405 &pxor ($inout0,$rndkey0); # ^ roundkey[0]
2406 &pxor ($inout1,$rndkey0);
2407 &pxor ($inout2,$rndkey0);
2408 &pxor ($inout3,$rndkey0);
2409 &pxor ($inout4,$rndkey0);
2410 &pxor ($inout5,$rndkey0);
2412 &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); # next round key
2413 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2414 &pxor ($inout1,&QWP(16*1,"esp"));
2415 &pxor ($inout2,&QWP(16*2,"esp"));
2416 &pxor ($inout3,&QWP(16*3,"esp"));
2417 &pxor ($inout4,&QWP(16*4,"esp"));
2418 &pxor ($inout5,&QWP(16*5,"esp"));
2420 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); # preload the following round key
2421 &aesdec ($inout0,$rndkey1); # one aesdec round on each of the six blocks
2422 &aesdec ($inout1,$rndkey1);
2423 &aesdec ($inout2,$rndkey1);
2424 &aesdec ($inout3,$rndkey1);
2425 &aesdec ($inout4,$rndkey1);
2426 &aesdec ($inout5,$rndkey1);
2428 &mov ($out,&DWP($out_off,"esp")); # reload output pointer
2429 &mov ($len,&DWP($end_off,"esp")); # reload loop end pointer
2430 &call ("_aesni_decrypt6_enter"); # continue in the shared 6x decrypt routine
2432 &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i
2433 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2434 &movdqa ($rndkey1,&QWP($checksum,"esp")); # reload checksum
2435 &pxor ($inout1,&QWP(16*1,"esp"));
2436 &pxor ($inout2,&QWP(16*2,"esp"));
2437 &pxor ($inout3,&QWP(16*3,"esp"));
2438 &pxor ($inout4,&QWP(16*4,"esp"));
2439 &pxor ($inout5,$rndkey0);
2441 &pxor ($rndkey1,$inout0); # checksum
2442 &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output
2443 &pxor ($rndkey1,$inout1);
2444 &movdqu (&QWP(-16*5,$out,$inp),$inout1);
2445 &pxor ($rndkey1,$inout2);
2446 &movdqu (&QWP(-16*4,$out,$inp),$inout2);
2447 &pxor ($rndkey1,$inout3);
2448 &movdqu (&QWP(-16*3,$out,$inp),$inout3);
2449 &pxor ($rndkey1,$inout4);
2450 &movdqu (&QWP(-16*2,$out,$inp),$inout4);
2451 &pxor ($rndkey1,$inout5);
2452 &movdqu (&QWP(-16*1,$out,$inp),$inout5);
2453 &cmp ($inp,$len); # done yet?
2454 &jb (&label("grandloop"));
2456 &set_label("short"); # tail: branch to done/one/two/three/four, otherwise fall through to the five-block path
2459 &jz (&label("done"));
2462 &jb (&label("one"));
2463 &je (&label("two"));
2466 &jb (&label("three"));
2467 &je (&label("four"));
2469 &lea ($i1,&DWP(1,$block)); # five-block path: $i1 = $block + 1
2470 &lea ($i3,&DWP(3,$block)); # $i3 = $block + 3
2475 &movdqu ($inout0,&QWP(0,$l_)); # L_ entry at offset 0
2476 &movdqu ($inout1,&QWP(0,$l_,$i1)); # L_ entry selected by $i1
2477 &mov ($rounds,&DWP($rounds_off,"esp")); # reload twisted rounds value
2478 &movdqa ($inout2,$inout0);
2479 &movdqu ($inout3,&QWP(0,$l_,$i3)); # L_ entry selected by $i3
2480 &movdqa ($inout4,$inout0);
2482 &pxor ($inout0,$rndkey0); # ^ last offset_i
2483 &pxor ($inout1,$inout0); # each offset_i = previous offset ^ its L_ entry
2484 &movdqa (&QWP(16*0,"esp"),$inout0); # spill the five offsets to stack slots 0..4
2485 &pxor ($inout2,$inout1);
2486 &movdqa (&QWP(16*1,"esp"),$inout1);
2487 &pxor ($inout3,$inout2);
2488 &movdqa (&QWP(16*2,"esp"),$inout2);
2489 &pxor ($inout4,$inout3);
2490 &movdqa (&QWP(16*3,"esp"),$inout3);
2491 &pxor ($inout5,$inout4);
2492 &movdqa (&QWP(16*4,"esp"),$inout4);
2494 &$movekey ($rndkey0,&QWP(-48,$key,$rounds)); # roundkey[0]
2495 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2496 &movdqu ($inout1,&QWP(16*1,$inp));
2497 &movdqu ($inout2,&QWP(16*2,$inp));
2498 &movdqu ($inout3,&QWP(16*3,$inp));
2499 &movdqu ($inout4,&QWP(16*4,$inp));
2500 &pxor ($inout5,$inout5); # sixth lane carries no data here: zero it
2502 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum: $rndkey1 is reused for round keys
2503 &pxor ($inout0,$rndkey0); # ^ roundkey[0]
2504 &pxor ($inout1,$rndkey0);
2505 &pxor ($inout2,$rndkey0);
2506 &pxor ($inout3,$rndkey0);
2507 &pxor ($inout4,$rndkey0);
2509 &$movekey ($rndkey1,&QWP(-32,$key,$rounds)); # next round key
2510 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2511 &pxor ($inout1,&QWP(16*1,"esp"));
2512 &pxor ($inout2,&QWP(16*2,"esp"));
2513 &pxor ($inout3,&QWP(16*3,"esp"));
2514 &pxor ($inout4,&QWP(16*4,"esp"));
2516 &$movekey ($rndkey0,&QWP(-16,$key,$rounds)); # preload the following round key
2517 &aesdec ($inout0,$rndkey1); # one aesdec round on all six lanes (sixth is the zeroed dummy)
2518 &aesdec ($inout1,$rndkey1);
2519 &aesdec ($inout2,$rndkey1);
2520 &aesdec ($inout3,$rndkey1);
2521 &aesdec ($inout4,$rndkey1);
2522 &aesdec ($inout5,$rndkey1);
2524 &mov ($out,&DWP($out_off,"esp")); # reload output pointer
2525 &call ("_aesni_decrypt6_enter"); # continue in the shared 6x decrypt routine
2527 &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i
2528 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2529 &movdqa ($rndkey1,&QWP($checksum,"esp")); # reload checksum
2530 &pxor ($inout1,&QWP(16*1,"esp"));
2531 &pxor ($inout2,&QWP(16*2,"esp"));
2532 &pxor ($inout3,&QWP(16*3,"esp"));
2533 &pxor ($inout4,$rndkey0);
2535 &pxor ($rndkey1,$inout0); # checksum
2536 &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output
2537 &pxor ($rndkey1,$inout1);
2538 &movdqu (&QWP(16*1,$out,$inp),$inout1);
2539 &pxor ($rndkey1,$inout2);
2540 &movdqu (&QWP(16*2,$out,$inp),$inout2);
2541 &pxor ($rndkey1,$inout3);
2542 &movdqu (&QWP(16*3,$out,$inp),$inout3);
2543 &pxor ($rndkey1,$inout4);
2544 &movdqu (&QWP(16*4,$out,$inp),$inout4);
2546 &jmp (&label("done"));
2548 &set_label("one",16); # tail path: a single block
2549 &movdqu ($inout5,&QWP(0,$l_)); # L_ entry at offset 0
2550 &mov ($key,&DWP($key_off,"esp")); # restore key
2552 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2553 &mov ($rounds,&DWP(240,$key)); # dword at key+240 (presumably the round count - confirm)
2555 &pxor ($inout5,$rndkey0); # ^ last offset_i
2556 &pxor ($inout0,$inout5); # ^ offset_i
2558 &movdqa ($inout4,$rndkey1); # park checksum in $inout4 across the single-block decrypt
2559 &mov ($out,&DWP($out_off,"esp")); # reload output pointer
2561 { &aesni_inline_generate1("dec"); }
2563 { &call ("_aesni_decrypt1"); }
2565 &xorps ($inout0,$inout5); # ^ offset_i
2566 &movaps ($rndkey1,$inout4); # pass the checksum
2567 &movdqa ($rndkey0,$inout5); # pass last offset_i
2568 &xorps ($rndkey1,$inout0); # checksum
2569 &movups (&QWP(0,$out,$inp),$inout0); # store output
2571 &jmp (&label("done"));
2573 &set_label("two",16); # tail path: two blocks
2574 &lea ($i1,&DWP(1,$block)); # $i1 = $block + 1
2575 &mov ($key,&DWP($key_off,"esp")); # restore key
2578 &movdqu ($inout4,&QWP(0,$l_)); # L_ entry at offset 0
2579 &movdqu ($inout5,&QWP(0,$l_,$i1)); # L_ entry selected by $i1
2581 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2582 &movdqu ($inout1,&QWP(16*1,$inp));
2583 &mov ($rounds,&DWP(240,$key)); # dword at key+240 (presumably the round count - confirm)
2585 &movdqa ($inout3,$rndkey1); # park checksum in $inout3 across the call
2586 &pxor ($inout4,$rndkey0); # ^ last offset_i
2587 &pxor ($inout5,$inout4); # second offset chains from the first
2589 &pxor ($inout0,$inout4); # ^ offset_i
2590 &pxor ($inout1,$inout5);
2592 &mov ($out,&DWP($out_off,"esp")); # reload output pointer
2593 &call ("_aesni_decrypt2");
2595 &xorps ($inout0,$inout4); # ^ offset_i
2596 &xorps ($inout1,$inout5);
2597 &movdqa ($rndkey0,$inout5); # pass last offset_i
2598 &xorps ($inout3,$inout0); # checksum
2599 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2600 &xorps ($inout3,$inout1);
2601 &movups (&QWP(16*1,$out,$inp),$inout1);
2602 &movaps ($rndkey1,$inout3); # pass the checksum
2604 &jmp (&label("done"));
2606 &set_label("three",16); # tail path: three blocks
2607 &lea ($i1,&DWP(1,$block)); # $i1 = $block + 1
2608 &mov ($key,&DWP($key_off,"esp")); # restore key
2611 &movdqu ($inout3,&QWP(0,$l_)); # L_ entry at offset 0
2612 &movdqu ($inout4,&QWP(0,$l_,$i1)); # L_ entry selected by $i1
2613 &movdqa ($inout5,$inout3); # offset-0 entry reused for the third block
2615 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2616 &movdqu ($inout1,&QWP(16*1,$inp));
2617 &movdqu ($inout2,&QWP(16*2,$inp));
2618 &mov ($rounds,&DWP(240,$key)); # dword at key+240 (presumably the round count - confirm)
2620 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum across the call
2621 &pxor ($inout3,$rndkey0); # ^ last offset_i
2622 &pxor ($inout4,$inout3); # each offset chains from the previous one
2623 &pxor ($inout5,$inout4);
2625 &pxor ($inout0,$inout3); # ^ offset_i
2626 &pxor ($inout1,$inout4);
2627 &pxor ($inout2,$inout5);
2629 &mov ($out,&DWP($out_off,"esp")); # reload output pointer
2630 &call ("_aesni_decrypt3");
2632 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2633 &xorps ($inout0,$inout3); # ^ offset_i
2634 &xorps ($inout1,$inout4);
2635 &xorps ($inout2,$inout5);
2636 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2637 &pxor ($rndkey1,$inout0); # checksum
2638 &movdqa ($rndkey0,$inout5); # pass last offset_i
2639 &movups (&QWP(16*1,$out,$inp),$inout1);
2640 &pxor ($rndkey1,$inout1);
2641 &movups (&QWP(16*2,$out,$inp),$inout2);
2642 &pxor ($rndkey1,$inout2);
2644 &jmp (&label("done"));
2646 &set_label("four",16); # tail path: four blocks
2647 &lea ($i1,&DWP(1,$block)); # $i1 = $block + 1
2648 &lea ($i3,&DWP(3,$block)); # $i3 = $block + 3
2651 &mov ($key,&DWP($key_off,"esp")); # restore key
2654 &movdqu ($inout2,&QWP(0,$l_)); # L_ entry at offset 0
2655 &movdqu ($inout3,&QWP(0,$l_,$i1)); # L_ entry selected by $i1
2656 &movdqa ($inout4,$inout2); # offset-0 entry reused for the third block
2657 &movdqu ($inout5,&QWP(0,$l_,$i3)); # L_ entry selected by $i3
2659 &pxor ($inout2,$rndkey0); # ^ last offset_i
2660 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2661 &pxor ($inout3,$inout2); # each offset chains from the previous one
2662 &movdqu ($inout1,&QWP(16*1,$inp));
2663 &pxor ($inout4,$inout3);
2664 &movdqa (&QWP(16*0,"esp"),$inout2); # spill first offset: $inout2 is needed for input
2665 &pxor ($inout5,$inout4);
2666 &movdqa (&QWP(16*1,"esp"),$inout3); # spill second offset: $inout3 is needed for input
2667 &movdqu ($inout2,&QWP(16*2,$inp));
2668 &movdqu ($inout3,&QWP(16*3,$inp));
2669 &mov ($rounds,&DWP(240,$key)); # dword at key+240 (presumably the round count - confirm)
2671 &movdqa (&QWP($checksum,"esp"),$rndkey1); # spill checksum across the call
2672 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2673 &pxor ($inout1,&QWP(16*1,"esp"));
2674 &pxor ($inout2,$inout4);
2675 &pxor ($inout3,$inout5);
2677 &mov ($out,&DWP($out_off,"esp")); # reload output pointer
2678 &call ("_aesni_decrypt4");
2680 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2681 &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2682 &xorps ($inout1,&QWP(16*1,"esp"));
2683 &xorps ($inout2,$inout4);
2684 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2685 &pxor ($rndkey1,$inout0); # checksum
2686 &xorps ($inout3,$inout5);
2687 &movups (&QWP(16*1,$out,$inp),$inout1);
2688 &pxor ($rndkey1,$inout1);
2689 &movdqa ($rndkey0,$inout5); # pass last offset_i
2690 &movups (&QWP(16*2,$out,$inp),$inout2);
2691 &pxor ($rndkey1,$inout2);
2692 &movups (&QWP(16*3,$out,$inp),$inout3);
2693 &pxor ($rndkey1,$inout3);
2696 &mov ($key,&DWP($esp_off,"esp")); # fetch the caller's %esp saved in the frame
2697 &pxor ($inout0,$inout0); # clear register bank
2698 &pxor ($inout1,$inout1);
2699 &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack
2700 &pxor ($inout2,$inout2);
2701 &movdqa (&QWP(16*1,"esp"),$inout0);
2702 &pxor ($inout3,$inout3);
2703 &movdqa (&QWP(16*2,"esp"),$inout0);
2704 &pxor ($inout4,$inout4);
2705 &movdqa (&QWP(16*3,"esp"),$inout0);
2706 &pxor ($inout5,$inout5);
2707 &movdqa (&QWP(16*4,"esp"),$inout0);
2708 &movdqa (&QWP(16*5,"esp"),$inout0);
2709 &movdqa (&QWP(16*6,"esp"),$inout0);
2711 &lea ("esp",&DWP(0,$key)); # restore caller's stack pointer
2712 &mov ($rounds,&wparam(5)); # &offset_i
2713 &mov ($rounds_,&wparam(7)); # &checksum
2714 &movdqu (&QWP(0,$rounds),$rndkey0); # write back the final offset_i
2715 &pxor ($rndkey0,$rndkey0); # scrub
2716 &movdqu (&QWP(0,$rounds_),$rndkey1); # write back the checksum
2717 &pxor ($rndkey1,$rndkey1); # scrub
2718 &function_end("aesni_ocb_decrypt");
2722 ######################################################################
2723 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
2724 # size_t length, const AES_KEY *key,
2725 # unsigned char *ivp,const int enc);
2726 &function_begin("${PREFIX}_cbc_encrypt"); # CBC encrypt/decrypt; wparam(5)==0 selects the decrypt path
2727 &mov ($inp,&wparam(0)); # input pointer
2728 &mov ($rounds_,"esp"); # start deriving the aligned scratch frame from %esp
2729 &mov ($out,&wparam(1)); # output pointer
2731 &mov ($len,&wparam(2)); # length
2732 &and ($rounds_,-16); # 16-byte align the frame address
2733 &mov ($key,&wparam(3)); # key schedule pointer
2734 &mov ($key_,&wparam(4)); # ivp
2736 &jz (&label("cbc_abort")); # early exit; cbc_abort sits after the IV write-back, so ivp is left untouched
2738 &cmp (&wparam(5),0); # enc flag; consumed by the je below (nothing in between alters flags)
2739 &xchg ($rounds_,"esp"); # alloca
2740 &movups ($ivec,&QWP(0,$key_)); # load IV
2741 &mov ($rounds,&DWP(240,$key)); # dword at key+240 (presumably the round count - confirm against the key layout)
2742 &mov ($key_,$key); # backup $key
2743 &mov (&DWP(16,"esp"),$rounds_); # save original %esp
2744 &mov ($rounds_,$rounds); # backup $rounds
2745 &je (&label("cbc_decrypt")); # enc == 0
2747 &movaps ($inout0,$ivec); # chaining value starts out as the IV
2749 &jb (&label("cbc_enc_tail"));
2751 &jmp (&label("cbc_enc_loop"));
2753 &set_label("cbc_enc_loop",16); # encrypt one block per iteration
2754 &movups ($ivec,&QWP(0,$inp)); # input actually
2755 &lea ($inp,&DWP(16,$inp)); # advance input by one block
2757 { &aesni_inline_generate1("enc",$inout0,$ivec); }
2759 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
2760 &mov ($rounds,$rounds_); # restore $rounds
2761 &mov ($key,$key_); # restore $key
2762 &movups (&QWP(0,$out),$inout0); # store output
2763 &lea ($out,&DWP(16,$out)); # advance output by one block
2765 &jnc (&label("cbc_enc_loop"));
2767 &jnz (&label("cbc_enc_tail"));
2768 &movaps ($ivec,$inout0); # last ciphertext block becomes the output IV
2769 &pxor ($inout0,$inout0); # scrub
2770 &jmp (&label("cbc_ret"));
2772 &set_label("cbc_enc_tail"); # partial final block: copy it to $out, zero-pad, then encrypt it in place
2773 &mov ("ecx",$len); # zaps $rounds
2774 &data_word(0xA4F3F689); # rep movsb
2775 &mov ("ecx",16); # zero tail
2777 &xor ("eax","eax"); # zaps $len
2778 &data_word(0xAAF3F689); # rep stosb
2779 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
2780 &mov ($rounds,$rounds_); # restore $rounds
2781 &mov ($inp,$out); # $inp and $out are the same
2782 &mov ($key,$key_); # restore $key
2783 &jmp (&label("cbc_enc_loop"));
2784 ######################################################################
2785 &set_label("cbc_decrypt",16); # decrypt path
2787 &jbe (&label("cbc_dec_tail"));
2788 &movaps (&QWP(0,"esp"),$ivec); # save IV
2790 &jmp (&label("cbc_dec_loop6_enter"));
2792 &set_label("cbc_dec_loop6",16); # six blocks per iteration
2793 &movaps (&QWP(0,"esp"),$rndkey0); # save IV
2794 &movups (&QWP(0,$out),$inout5); # store the sixth plaintext block left pending by the previous iteration
2795 &lea ($out,&DWP(0x10,$out));
2796 &set_label("cbc_dec_loop6_enter");
2797 &movdqu ($inout0,&QWP(0,$inp)); # load six ciphertext blocks
2798 &movdqu ($inout1,&QWP(0x10,$inp));
2799 &movdqu ($inout2,&QWP(0x20,$inp));
2800 &movdqu ($inout3,&QWP(0x30,$inp));
2801 &movdqu ($inout4,&QWP(0x40,$inp));
2802 &movdqu ($inout5,&QWP(0x50,$inp));
2804 &call ("_aesni_decrypt6");
2806 &movups ($rndkey1,&QWP(0,$inp)); # re-read ciphertext: block i-1 is the CBC mask for block i
2807 &movups ($rndkey0,&QWP(0x10,$inp));
2808 &xorps ($inout0,&QWP(0,"esp")); # ^=IV
2809 &xorps ($inout1,$rndkey1);
2810 &movups ($rndkey1,&QWP(0x20,$inp));
2811 &xorps ($inout2,$rndkey0);
2812 &movups ($rndkey0,&QWP(0x30,$inp));
2813 &xorps ($inout3,$rndkey1);
2814 &movups ($rndkey1,&QWP(0x40,$inp));
2815 &xorps ($inout4,$rndkey0);
2816 &movups ($rndkey0,&QWP(0x50,$inp)); # IV
2817 &xorps ($inout5,$rndkey1);
2818 &movups (&QWP(0,$out),$inout0); # store the first five plaintext blocks
2819 &movups (&QWP(0x10,$out),$inout1);
2820 &lea ($inp,&DWP(0x60,$inp)); # advance input by six blocks
2821 &movups (&QWP(0x20,$out),$inout2);
2822 &mov ($rounds,$rounds_); # restore $rounds
2823 &movups (&QWP(0x30,$out),$inout3);
2824 &mov ($key,$key_); # restore $key
2825 &movups (&QWP(0x40,$out),$inout4);
2826 &lea ($out,&DWP(0x50,$out)); # advance by five blocks only; the sixth is still pending in $inout5
2828 &ja (&label("cbc_dec_loop6"));
2830 &movaps ($inout0,$inout5); # pending sixth block moves to $inout0
2831 &movaps ($ivec,$rndkey0); # last ciphertext block is the next IV
2833 &jle (&label("cbc_dec_clear_tail_collected"));
2834 &movups (&QWP(0,$out),$inout0); # more input follows: flush the pending block now
2835 &lea ($out,&DWP(0x10,$out));
2836 &set_label("cbc_dec_tail"); # tail: branch to the one/two/three/four paths, otherwise fall through to five blocks
2837 &movups ($inout0,&QWP(0,$inp));
2838 &movaps ($in0,$inout0); # keep a ciphertext copy for chaining
2840 &jbe (&label("cbc_dec_one"));
2842 &movups ($inout1,&QWP(0x10,$inp));
2843 &movaps ($in1,$inout1); # keep a ciphertext copy for chaining
2845 &jbe (&label("cbc_dec_two"));
2847 &movups ($inout2,&QWP(0x20,$inp));
2849 &jbe (&label("cbc_dec_three"));
2851 &movups ($inout3,&QWP(0x30,$inp));
2853 &jbe (&label("cbc_dec_four"));
2855 &movups ($inout4,&QWP(0x40,$inp));
2856 &movaps (&QWP(0,"esp"),$ivec); # save IV
2857 &movups ($inout0,&QWP(0,$inp));
2858 &xorps ($inout5,$inout5); # sixth lane carries no data here: zero it
2859 &call ("_aesni_decrypt6");
2860 &movups ($rndkey1,&QWP(0,$inp)); # re-read ciphertext for chaining
2861 &movups ($rndkey0,&QWP(0x10,$inp));
2862 &xorps ($inout0,&QWP(0,"esp")); # ^= IV
2863 &xorps ($inout1,$rndkey1);
2864 &movups ($rndkey1,&QWP(0x20,$inp));
2865 &xorps ($inout2,$rndkey0);
2866 &movups ($rndkey0,&QWP(0x30,$inp));
2867 &xorps ($inout3,$rndkey1);
2868 &movups ($ivec,&QWP(0x40,$inp)); # IV
2869 &xorps ($inout4,$rndkey0);
2870 &movups (&QWP(0,$out),$inout0);
2871 &movups (&QWP(0x10,$out),$inout1);
2872 &pxor ($inout1,$inout1); # scrub each register once its block is stored
2873 &movups (&QWP(0x20,$out),$inout2);
2874 &pxor ($inout2,$inout2);
2875 &movups (&QWP(0x30,$out),$inout3);
2876 &pxor ($inout3,$inout3);
2877 &lea ($out,&DWP(0x40,$out));
2878 &movaps ($inout0,$inout4); # last block is handed to cbc_dec_tail_collected in $inout0
2879 &pxor ($inout4,$inout4);
2881 &jmp (&label("cbc_dec_tail_collected"));
2883 &set_label("cbc_dec_one",16); # tail: one block
2885 { &aesni_inline_generate1("dec"); }
2887 { &call ("_aesni_decrypt1"); }
2888 &xorps ($inout0,$ivec); # ^= IV
2889 &movaps ($ivec,$in0); # this ciphertext block is the next IV
2891 &jmp (&label("cbc_dec_tail_collected"));
2893 &set_label("cbc_dec_two",16); # tail: two blocks
2894 &call ("_aesni_decrypt2");
2895 &xorps ($inout0,$ivec); # ^= IV
2896 &xorps ($inout1,$in0); # ^= previous ciphertext block
2897 &movups (&QWP(0,$out),$inout0);
2898 &movaps ($inout0,$inout1); # last block goes out via $inout0
2899 &pxor ($inout1,$inout1); # scrub
2900 &lea ($out,&DWP(0x10,$out));
2901 &movaps ($ivec,$in1); # last ciphertext block is the next IV
2903 &jmp (&label("cbc_dec_tail_collected"));
2905 &set_label("cbc_dec_three",16); # tail: three blocks
2906 &call ("_aesni_decrypt3");
2907 &xorps ($inout0,$ivec); # ^= IV
2908 &xorps ($inout1,$in0); # ^= previous ciphertext blocks
2909 &xorps ($inout2,$in1);
2910 &movups (&QWP(0,$out),$inout0);
2911 &movaps ($inout0,$inout2); # last block goes out via $inout0
2912 &pxor ($inout2,$inout2); # scrub
2913 &movups (&QWP(0x10,$out),$inout1);
2914 &pxor ($inout1,$inout1); # scrub
2915 &lea ($out,&DWP(0x20,$out));
2916 &movups ($ivec,&QWP(0x20,$inp)); # last ciphertext block is the next IV
2918 &jmp (&label("cbc_dec_tail_collected"));
2920 &set_label("cbc_dec_four",16); # tail: four blocks
2921 &call ("_aesni_decrypt4");
2922 &movups ($rndkey1,&QWP(0x10,$inp)); # re-read ciphertext blocks 1 and 2 for chaining
2923 &movups ($rndkey0,&QWP(0x20,$inp));
2924 &xorps ($inout0,$ivec); # ^= IV
2925 &movups ($ivec,&QWP(0x30,$inp)); # last ciphertext block is the next IV
2926 &xorps ($inout1,$in0);
2927 &movups (&QWP(0,$out),$inout0);
2928 &xorps ($inout2,$rndkey1);
2929 &movups (&QWP(0x10,$out),$inout1);
2930 &pxor ($inout1,$inout1); # scrub
2931 &xorps ($inout3,$rndkey0);
2932 &movups (&QWP(0x20,$out),$inout2);
2933 &pxor ($inout2,$inout2); # scrub
2934 &lea ($out,&DWP(0x30,$out));
2935 &movaps ($inout0,$inout3); # last block goes out via $inout0
2936 &pxor ($inout3,$inout3); # scrub
2938 &jmp (&label("cbc_dec_tail_collected"));
2940 &set_label("cbc_dec_clear_tail_collected",16); # scrub lanes 1-4, then fall into cbc_dec_tail_collected
2941 &pxor ($inout1,$inout1);
2942 &pxor ($inout2,$inout2);
2943 &pxor ($inout3,$inout3);
2944 &pxor ($inout4,$inout4);
2945 &set_label("cbc_dec_tail_collected"); # $inout0 holds the final plaintext block, not yet stored
2947 &jnz (&label("cbc_dec_tail_partial"));
2948 &movups (&QWP(0,$out),$inout0); # store the whole final block
2949 &pxor ($rndkey0,$rndkey0); # scrub
2950 &jmp (&label("cbc_ret"));
2952 &set_label("cbc_dec_tail_partial",16); # partial final block: stage it on the stack and copy out with rep movsb
2953 &movaps (&QWP(0,"esp"),$inout0); # stage the plaintext block in the stack slot
2954 &pxor ($rndkey0,$rndkey0);
2958 &data_word(0xA4F3F689); # rep movsb
2959 &movdqa (&QWP(0,"esp"),$inout0); # NOTE(review): re-stores $inout0, which nothing visible here has cleared (the pxor above zeroes $rndkey0); if this is meant to scrub the stack copy, confirm which register should be stored
2961 &set_label("cbc_ret"); # common exit: restore %esp and write back the IV
2962 &mov ("esp",&DWP(16,"esp")); # pull original %esp
2963 &mov ($key_,&wparam(4)); # ivp
2964 &pxor ($inout0,$inout0); # scrub
2965 &pxor ($rndkey1,$rndkey1); # scrub
2966 &movups (&QWP(0,$key_),$ivec); # output IV
2967 &pxor ($ivec,$ivec); # scrub
2968 &set_label("cbc_abort"); # early-exit target: no IV write-back
2969 &function_end("${PREFIX}_cbc_encrypt");
2971 ######################################################################
2972 # Mechanical port from aesni-x86_64.pl.
2974 # _aesni_set_encrypt_key is private interface,
2976 # "eax" const unsigned char *userKey
2983 &function_begin_B("_aesni_set_encrypt_key");
2986 &test ("eax","eax");
2987 &jz (&label("bad_pointer"));
2989 &jz (&label("bad_pointer"));
2991 &call (&label("pic"));
2994 &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
2996 &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
2997 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
2998 &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
2999 &mov ("ebp",&DWP(4,"ebp"));
3000 &lea ($key,&DWP(16,$key));
3001 &and ("ebp",1<<28|1<<11); # AVX and XOP bits
3003 &je (&label("14rounds"));
3005 &je (&label("12rounds"));
3007 &jne (&label("bad_keybits"));
3009 &set_label("10rounds",16);
3011 &je (&label("10rounds_alt"));
3014 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
3015 &aeskeygenassist("xmm1","xmm0",0x01); # round 1
3016 &call (&label("key_128_cold"));
3017 &aeskeygenassist("xmm1","xmm0",0x2); # round 2
3018 &call (&label("key_128"));
3019 &aeskeygenassist("xmm1","xmm0",0x04); # round 3
3020 &call (&label("key_128"));
3021 &aeskeygenassist("xmm1","xmm0",0x08); # round 4
3022 &call (&label("key_128"));
3023 &aeskeygenassist("xmm1","xmm0",0x10); # round 5
3024 &call (&label("key_128"));
3025 &aeskeygenassist("xmm1","xmm0",0x20); # round 6
3026 &call (&label("key_128"));
3027 &aeskeygenassist("xmm1","xmm0",0x40); # round 7
3028 &call (&label("key_128"));
3029 &aeskeygenassist("xmm1","xmm0",0x80); # round 8
3030 &call (&label("key_128"));
3031 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
3032 &call (&label("key_128"));
3033 &aeskeygenassist("xmm1","xmm0",0x36); # round 10
3034 &call (&label("key_128"));
3035 &$movekey (&QWP(0,$key),"xmm0");
3036 &mov (&DWP(80,$key),$rounds);
3038 &jmp (&label("good_key"));
3040 &set_label("key_128",16);
3041 &$movekey (&QWP(0,$key),"xmm0");
3042 &lea ($key,&DWP(16,$key));
3043 &set_label("key_128_cold");
3044 &shufps ("xmm4","xmm0",0b00010000);
3045 &xorps ("xmm0","xmm4");
3046 &shufps ("xmm4","xmm0",0b10001100);
3047 &xorps ("xmm0","xmm4");
3048 &shufps ("xmm1","xmm1",0b11111111); # critical path
3049 &xorps ("xmm0","xmm1");
3052 &set_label("10rounds_alt",16);
3053 &movdqa ("xmm5",&QWP(0x00,"ebx"));
3055 &movdqa ("xmm4",&QWP(0x20,"ebx"));
3056 &movdqa ("xmm2","xmm0");
3057 &movdqu (&QWP(-16,$key),"xmm0");
3059 &set_label("loop_key128");
3060 &pshufb ("xmm0","xmm5");
3061 &aesenclast ("xmm0","xmm4");
3063 &lea ($key,&DWP(16,$key));
3065 &movdqa ("xmm3","xmm2");
3067 &pxor ("xmm3","xmm2");
3069 &pxor ("xmm3","xmm2");
3071 &pxor ("xmm2","xmm3");
3073 &pxor ("xmm0","xmm2");
3074 &movdqu (&QWP(-16,$key),"xmm0");
3075 &movdqa ("xmm2","xmm0");
3078 &jnz (&label("loop_key128"));
3080 &movdqa ("xmm4",&QWP(0x30,"ebx"));
3082 &pshufb ("xmm0","xmm5");
3083 &aesenclast ("xmm0","xmm4");
3086 &movdqa ("xmm3","xmm2");
3088 &pxor ("xmm3","xmm2");
3090 &pxor ("xmm3","xmm2");
3092 &pxor ("xmm2","xmm3");
3094 &pxor ("xmm0","xmm2");
3095 &movdqu (&QWP(0,$key),"xmm0");
3097 &movdqa ("xmm2","xmm0");
3098 &pshufb ("xmm0","xmm5");
3099 &aesenclast ("xmm0","xmm4");
3101 &movdqa ("xmm3","xmm2");
3103 &pxor ("xmm3","xmm2");
3105 &pxor ("xmm3","xmm2");
3107 &pxor ("xmm2","xmm3");
3109 &pxor ("xmm0","xmm2");
3110 &movdqu (&QWP(16,$key),"xmm0");
3113 &mov (&DWP(96,$key),$rounds);
3115 &jmp (&label("good_key"));
3117 &set_label("12rounds",16);
3118 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
3120 &je (&label("12rounds_alt"));
3123 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
3124 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
3125 &call (&label("key_192a_cold"));
3126 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
3127 &call (&label("key_192b"));
3128 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
3129 &call (&label("key_192a"));
3130 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
3131 &call (&label("key_192b"));
3132 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
3133 &call (&label("key_192a"));
3134 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
3135 &call (&label("key_192b"));
3136 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
3137 &call (&label("key_192a"));
3138 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
3139 &call (&label("key_192b"));
3140 &$movekey (&QWP(0,$key),"xmm0");
3141 &mov (&DWP(48,$key),$rounds);
3143 &jmp (&label("good_key"));
3145 &set_label("key_192a",16);
3146 &$movekey (&QWP(0,$key),"xmm0");
3147 &lea ($key,&DWP(16,$key));
3148 &set_label("key_192a_cold",16);
3149 &movaps ("xmm5","xmm2");
3150 &set_label("key_192b_warm");
3151 &shufps ("xmm4","xmm0",0b00010000);
3152 &movdqa ("xmm3","xmm2");
3153 &xorps ("xmm0","xmm4");
3154 &shufps ("xmm4","xmm0",0b10001100);
3156 &xorps ("xmm0","xmm4");
3157 &pshufd ("xmm1","xmm1",0b01010101); # critical path
3158 &pxor ("xmm2","xmm3");
3159 &pxor ("xmm0","xmm1");
3160 &pshufd ("xmm3","xmm0",0b11111111);
3161 &pxor ("xmm2","xmm3");
3164 &set_label("key_192b",16);
3165 &movaps ("xmm3","xmm0");
3166 &shufps ("xmm5","xmm0",0b01000100);
3167 &$movekey (&QWP(0,$key),"xmm5");
3168 &shufps ("xmm3","xmm2",0b01001110);
3169 &$movekey (&QWP(16,$key),"xmm3");
3170 &lea ($key,&DWP(32,$key));
3171 &jmp (&label("key_192b_warm"));
3173 &set_label("12rounds_alt",16);
3174 &movdqa ("xmm5",&QWP(0x10,"ebx"));
3175 &movdqa ("xmm4",&QWP(0x20,"ebx"));
3177 &movdqu (&QWP(-16,$key),"xmm0");
3179 &set_label("loop_key192");
3180 &movq (&QWP(0,$key),"xmm2");
3181 &movdqa ("xmm1","xmm2");
3182 &pshufb ("xmm2","xmm5");
3183 &aesenclast ("xmm2","xmm4");
3185 &lea ($key,&DWP(24,$key));
3187 &movdqa ("xmm3","xmm0");
3189 &pxor ("xmm3","xmm0");
3191 &pxor ("xmm3","xmm0");
3193 &pxor ("xmm0","xmm3");
3195 &pshufd ("xmm3","xmm0",0xff);
3196 &pxor ("xmm3","xmm1");
3198 &pxor ("xmm3","xmm1");
3200 &pxor ("xmm0","xmm2");
3201 &pxor ("xmm2","xmm3");
3202 &movdqu (&QWP(-16,$key),"xmm0");
3205 &jnz (&label("loop_key192"));
3208 &mov (&DWP(32,$key),$rounds);
3210 &jmp (&label("good_key"));
3212 &set_label("14rounds",16);
3213 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
3214 &lea ($key,&DWP(16,$key));
3216 &je (&label("14rounds_alt"));
3219 &$movekey (&QWP(-32,$key),"xmm0"); # round 0
3220 &$movekey (&QWP(-16,$key),"xmm2"); # round 1
3221 &aeskeygenassist("xmm1","xmm2",0x01); # round 2
3222 &call (&label("key_256a_cold"));
3223 &aeskeygenassist("xmm1","xmm0",0x01); # round 3
3224 &call (&label("key_256b"));
3225 &aeskeygenassist("xmm1","xmm2",0x02); # round 4
3226 &call (&label("key_256a"));
3227 &aeskeygenassist("xmm1","xmm0",0x02); # round 5
3228 &call (&label("key_256b"));
3229 &aeskeygenassist("xmm1","xmm2",0x04); # round 6
3230 &call (&label("key_256a"));
3231 &aeskeygenassist("xmm1","xmm0",0x04); # round 7
3232 &call (&label("key_256b"));
3233 &aeskeygenassist("xmm1","xmm2",0x08); # round 8
3234 &call (&label("key_256a"));
3235 &aeskeygenassist("xmm1","xmm0",0x08); # round 9
3236 &call (&label("key_256b"));
3237 &aeskeygenassist("xmm1","xmm2",0x10); # round 10
3238 &call (&label("key_256a"));
3239 &aeskeygenassist("xmm1","xmm0",0x10); # round 11
3240 &call (&label("key_256b"));
3241 &aeskeygenassist("xmm1","xmm2",0x20); # round 12
3242 &call (&label("key_256a"));
3243 &aeskeygenassist("xmm1","xmm0",0x20); # round 13
3244 &call (&label("key_256b"));
3245 &aeskeygenassist("xmm1","xmm2",0x40); # round 14
3246 &call (&label("key_256a"));
3247 &$movekey (&QWP(0,$key),"xmm0");
3248 &mov (&DWP(16,$key),$rounds);
3251 &jmp (&label("good_key"));
3253 &set_label("key_256a",16);
3254 &$movekey (&QWP(0,$key),"xmm2");
3255 &lea ($key,&DWP(16,$key));
3256 &set_label("key_256a_cold");
3257 &shufps ("xmm4","xmm0",0b00010000);
3258 &xorps ("xmm0","xmm4");
3259 &shufps ("xmm4","xmm0",0b10001100);
3260 &xorps ("xmm0","xmm4");
3261 &shufps ("xmm1","xmm1",0b11111111); # critical path
3262 &xorps ("xmm0","xmm1");
3265 &set_label("key_256b",16);
3266 &$movekey (&QWP(0,$key),"xmm0");
3267 &lea ($key,&DWP(16,$key));
3269 &shufps ("xmm4","xmm2",0b00010000);
3270 &xorps ("xmm2","xmm4");
3271 &shufps ("xmm4","xmm2",0b10001100);
3272 &xorps ("xmm2","xmm4");
3273 &shufps ("xmm1","xmm1",0b10101010); # critical path
3274 &xorps ("xmm2","xmm1");
3277 &set_label("14rounds_alt",16);
3278 &movdqa ("xmm5",&QWP(0x00,"ebx"));
3279 &movdqa ("xmm4",&QWP(0x20,"ebx"));
3281 &movdqu (&QWP(-32,$key),"xmm0");
3282 &movdqa ("xmm1","xmm2");
3283 &movdqu (&QWP(-16,$key),"xmm2");
3285 &set_label("loop_key256");
3286 &pshufb ("xmm2","xmm5");
3287 &aesenclast ("xmm2","xmm4");
3289 &movdqa ("xmm3","xmm0");
3291 &pxor ("xmm3","xmm0");
3293 &pxor ("xmm3","xmm0");
3295 &pxor ("xmm0","xmm3");
3298 &pxor ("xmm0","xmm2");
3299 &movdqu (&QWP(0,$key),"xmm0");
3302 &jz (&label("done_key256"));
3304 &pshufd ("xmm2","xmm0",0xff);
3305 &pxor ("xmm3","xmm3");
3306 &aesenclast ("xmm2","xmm3");
3308 &movdqa ("xmm3","xmm1");
3310 &pxor ("xmm3","xmm1");
3312 &pxor ("xmm3","xmm1");
3314 &pxor ("xmm1","xmm3");
3316 &pxor ("xmm2","xmm1");
3317 &movdqu (&QWP(16,$key),"xmm2");
3318 &lea ($key,&DWP(32,$key));
3319 &movdqa ("xmm1","xmm2");
3320 &jmp (&label("loop_key256"));
3322 &set_label("done_key256");
3324 &mov (&DWP(16,$key),$rounds);
3326 &set_label("good_key");
3327 &pxor ("xmm0","xmm0");
3328 &pxor ("xmm1","xmm1");
3329 &pxor ("xmm2","xmm2");
3330 &pxor ("xmm3","xmm3");
3331 &pxor ("xmm4","xmm4");
3332 &pxor ("xmm5","xmm5");
3338 &set_label("bad_pointer",4);
3343 &set_label("bad_keybits",4);
3344 &pxor ("xmm0","xmm0");
3349 &function_end_B("_aesni_set_encrypt_key");
# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Public entry point for building an encryption key schedule. It only
# marshals the three stack arguments into the registers that
# _aesni_set_encrypt_key works on ("eax" = userKey, $rounds = bits,
# $key = key) and returns whatever that routine leaves in "eax".
&function_begin_B("${PREFIX}_set_encrypt_key");
	&mov	("eax",&wparam(0));		# userKey
	&mov	($rounds,&wparam(1));		# bits
	&mov	($key,&wparam(2));		# key schedule to fill in
	&call	("_aesni_set_encrypt_key");
	# function_end_B only closes the function, it does not emit a return,
	# so without an explicit ret control would run into the next function.
	&ret	();
&function_end_B("${PREFIX}_set_encrypt_key");
# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Builds the encryption schedule with _aesni_set_encrypt_key and then
# converts it in place into a decryption schedule:
#   - the first and last round keys are swapped as they are;
#   - the remaining round keys are swapped pairwise from both ends, each
#     one passed through aesimc on the way;
#   - the round key left in the middle is passed through aesimc in place.
# Returns 0 in "eax" on success. If _aesni_set_encrypt_key returns
# non-zero, that value is returned unchanged and the schedule is not
# touched. xmm0 and xmm1 are zeroed before a successful return.
&function_begin_B("${PREFIX}_set_decrypt_key");
	&mov	("eax",&wparam(0));		# userKey
	&mov	($rounds,&wparam(1));		# bits
	&mov	($key,&wparam(2));		# key schedule
	&call	("_aesni_set_encrypt_key");
	&mov	($key,&wparam(2));		# reload, the helper advances $key
	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
	&test	("eax","eax");
	&jnz	(&label("dec_key_ret"));	# helper failed, hand its code back
	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule

	&$movekey	("xmm0",&QWP(0,$key));	# just swap
	&$movekey	("xmm1",&QWP(0,"eax"));
	&$movekey	(&QWP(0,"eax"),"xmm0");
	&$movekey	(&QWP(0,$key),"xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));

&set_label("dec_key_inverse");
	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
	&$movekey	("xmm1",&QWP(0,"eax"));
	&aesimc		("xmm0","xmm0");
	&aesimc		("xmm1","xmm1");
	&lea		($key,&DWP(16,$key));
	&lea		("eax",&DWP(-16,"eax"));
	&$movekey	(&QWP(16,"eax"),"xmm0");
	&$movekey	(&QWP(-16,$key),"xmm1");
	# Nothing in the loop body sets flags, so the two cursors have to be
	# compared explicitly; otherwise ja would act on the stale result of
	# the "test eax,eax" above and the loop would never repeat.
	&cmp		("eax",$key);
	&ja		(&label("dec_key_inverse"));	# until the cursors meet

	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
	&aesimc		("xmm0","xmm0");
	&$movekey	(&QWP(0,$key),"xmm0");

	&pxor		("xmm0","xmm0");	# do not leave round keys in registers
	&pxor		("xmm1","xmm1");
	&xor		("eax","eax");		# return success
&set_label("dec_key_ret");
	# function_end_B does not emit a return; both the success path and the
	# early-exit path need this ret.
	&ret	();
&function_end_B("${PREFIX}_set_decrypt_key");
# Constant table used by the *_alt key-schedule paths, aligned to 64 bytes.
# NOTE(review): those paths address it as offsets from "ebx"; the
# instruction that points "ebx" at key_const is outside this view - confirm.
#   +0x00  mask loaded into xmm5 for pshufb on the 10rounds_alt and
#          14rounds_alt paths
#   +0x10  mask loaded into xmm5 for pshufb on the 12rounds_alt path
#   +0x20  dword 1 in every lane, loaded into xmm4 as the aesenclast operand
#   +0x30  dword 0x1b in every lane, loaded into xmm4 after loop_key128
&set_label("key_const",64);
&data_word((0x0c0f0e0d) x 4);
&data_word((0x04070605) x 4);
&data_word((1) x 4);
&data_word((0x1b) x 4);
# Attribution string embedded in the generated object.
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
# Everything above only queues assembly text; asm_finish is what writes it
# out, so it has to run before STDOUT is closed.
&asm_finish();

# Buffered write errors (full disk, closed pipe) only surface at close time;
# include the OS error so a failed build says why.
close STDOUT or die "error closing STDOUT: $!";