2 # Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements support for Intel AES-NI extension. In
18 # OpenSSL context it's used with Intel engine, but can also be used as
19 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
24 # To start with see corresponding paragraph in aesni-x86_64.pl...
25 # Instead of filling table similar to one found there I've chosen to
26 # summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
27 # The simplified table below represents 32-bit performance relative
28 # to 64-bit one in every given point. Ratios vary for different
29 # encryption modes, therefore interval values.
31 # 16-byte 64-byte 256-byte 1-KB 8-KB
32 # 53-67% 67-84% 91-94% 95-98% 97-99.5%
34 # Lower ratios for smaller block sizes are perfectly understandable,
35 # because function call overhead is higher in 32-bit mode. Largest
36 # 8-KB block performance is virtually same: 32-bit code is less than
37 # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
41 # See aesni-x86_64.pl for details. Unlike x86_64 version this module
42 # interleaves at most 6 aes[enc|dec] instructions, because there are
43 # not enough registers for 8x interleave [which should be optimal for
44 # Sandy Bridge]. Actually, performance results for 6x interleave
45 # factor presented in aesni-x86_64.pl (except for CTR) are for this
50 # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
51 # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
55 # Add aesni_ocb_[en|de]crypt.
57 ######################################################################
58 # Current large-block performance in cycles per byte processed with
59 # 128-bit key (less is better).
61 # CBC en-/decrypt CTR XTS ECB OCB
62 # Westmere 3.77/1.37 1.37 1.52 1.27
63 # * Bridge 5.07/0.98 0.99 1.09 0.91 1.10
64 # Haswell 4.44/0.80 0.97 1.03 0.72 0.76
65 # Skylake 2.68/0.65 0.65 0.66 0.64 0.66
66 # Silvermont 5.77/3.56 3.67 4.03 3.46 4.03
67 # Goldmont 3.84/1.39 1.39 1.63 1.31 1.70
68 # Bulldozer 5.80/0.98 1.05 1.24 0.93 1.23
70 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
71 # generates drop-in replacement for
72 # crypto/aes/asm/aes-586.pl:-)
73 $inline=1; # inline _aesni_[en|de]crypt
75 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
76 push(@INC,"${dir}","${dir}../../perlasm");
85 &external_label("OPENSSL_ia32cap_P");
86 &static_label("key_const");
# Choose the instruction used for round-key loads; the rest of the script
# invokes it indirectly as &$movekey(...). Both branches currently select
# movups, so the result does not depend on $PREFIX.
88 if ($PREFIX eq "aesni") { $movekey=\&movups; }
89 else { $movekey=\&movups; }
96 $rounds_="ebx"; # backup copy for $rounds
97 $key_="ebp"; # backup copy for $key
104 $inout3="xmm5"; $in1="xmm5";
105 $inout4="xmm6"; $in0="xmm6";
106 $inout5="xmm7"; $ivec="xmm7";
110 { my($dst,$src,$imm)=@_;
111 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
112 { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
115 { my($opcodelet,$dst,$src)=@_;
116 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
117 { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
# One wrapper per AES instruction mnemonic. Each forwards its operands
# (@_) to aescommon, prepending the opcode byte that distinguishes the
# instruction: 0xdb..0xdf.
119 sub aesimc { aescommon(0xdb,@_); }
120 sub aesenc { aescommon(0xdc,@_); }
121 sub aesenclast { aescommon(0xdd,@_); }
122 sub aesdec { aescommon(0xde,@_); }
123 sub aesdeclast { aescommon(0xdf,@_); }
125 # Inline version of internal aesni_[en|de]crypt1
# Emit the single-block aes${p} sequence inline at the point of call.
#   $p     - "enc" or "dec"; spliced into the instruction names (aes${p},
#            aes${p}last) and into the loop label name
#   $inout - register holding the block; defaults to $inout0
#   $ivec  - optional register; when given it is xored with round key 0
#            and then into $inout, otherwise $inout is xored with round
#            key 0 directly
# NOTE(review): $sn (label suffix) and the instruction that sets the flags
# tested by jnz are not visible in this excerpt - confirm against the
# full file.
127 sub aesni_inline_generate1
128 { my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
131 &$movekey ($rndkey0,&QWP(0,$key));
132 &$movekey ($rndkey1,&QWP(16,$key));
133 &xorps ($ivec,$rndkey0) if (defined($ivec));
134 &lea ($key,&DWP(32,$key));
135 &xorps ($inout,$ivec) if (defined($ivec));
136 &xorps ($inout,$rndkey0) if (!defined($ivec));
# Round loop: apply one round, fetch the next round key, advance $key.
137 &set_label("${p}1_loop_$sn");
138 eval"&aes${p} ($inout,$rndkey1)";
140 &$movekey ($rndkey1,&QWP(0,$key));
141 &lea ($key,&DWP(16,$key));
142 &jnz (&label("${p}1_loop_$sn"));
# Final round uses the "last" form of the instruction.
143 eval"&aes${p}last ($inout,$rndkey1)";
# Emit the private routine _aesni_${p}rypt1: a straight-line (loop-free)
# single-block transform of $inout.
#   $p     - "enc" or "dec"; selects aes${p}/aes${p}last and names the
#            routine and its labels
#   $inout - register holding the block; defaults to $inout0
# Round keys alternate between $rndkey0 and $rndkey1; each register is
# reloaded right after the aes${p} that consumed it.
146 sub aesni_generate1 # fully unrolled loop
147 { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
149 &function_begin_B("_aesni_${p}rypt1");
150 &movups ($rndkey0,&QWP(0,$key));
151 &$movekey ($rndkey1,&QWP(0x10,$key));
152 &xorps ($inout,$rndkey0);
153 &$movekey ($rndkey0,&QWP(0x20,$key));
154 &lea ($key,&DWP(0x30,$key));
# NOTE(review): the compare feeding jb/je is not visible in this excerpt;
# the label names suggest a dispatch on key size - confirm.
156 &jb (&label("${p}128"));
157 &lea ($key,&DWP(0x20,$key));
158 &je (&label("${p}192"));
159 &lea ($key,&DWP(0x20,$key));
# Fall-through path: two extra rounds before joining the ${p}192 path.
160 eval"&aes${p} ($inout,$rndkey1)";
161 &$movekey ($rndkey1,&QWP(-0x40,$key));
162 eval"&aes${p} ($inout,$rndkey0)";
163 &$movekey ($rndkey0,&QWP(-0x30,$key));
# ${p}192 entry: two extra rounds before joining the ${p}128 path.
164 &set_label("${p}192");
165 eval"&aes${p} ($inout,$rndkey1)";
166 &$movekey ($rndkey1,&QWP(-0x20,$key));
167 eval"&aes${p} ($inout,$rndkey0)";
168 &$movekey ($rndkey0,&QWP(-0x10,$key));
# ${p}128 entry: the rounds common to all three paths.
169 &set_label("${p}128");
170 eval"&aes${p} ($inout,$rndkey1)";
171 &$movekey ($rndkey1,&QWP(0,$key));
172 eval"&aes${p} ($inout,$rndkey0)";
173 &$movekey ($rndkey0,&QWP(0x10,$key));
174 eval"&aes${p} ($inout,$rndkey1)";
175 &$movekey ($rndkey1,&QWP(0x20,$key));
176 eval"&aes${p} ($inout,$rndkey0)";
177 &$movekey ($rndkey0,&QWP(0x30,$key));
178 eval"&aes${p} ($inout,$rndkey1)";
179 &$movekey ($rndkey1,&QWP(0x40,$key));
180 eval"&aes${p} ($inout,$rndkey0)";
181 &$movekey ($rndkey0,&QWP(0x50,$key));
182 eval"&aes${p} ($inout,$rndkey1)";
183 &$movekey ($rndkey1,&QWP(0x60,$key));
184 eval"&aes${p} ($inout,$rndkey0)";
185 &$movekey ($rndkey0,&QWP(0x70,$key));
186 eval"&aes${p} ($inout,$rndkey1)";
187 eval"&aes${p}last ($inout,$rndkey0)";
189 &function_end_B("_aesni_${p}rypt1");
192 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# Single-block entry point. Loads one block from the address in argument 0,
# takes the key pointer from argument 2 and the round count from offset
# 240 of the key structure, and stores the result to the address in
# argument 1. The out-of-line helper _aesni_encrypt1 is generated only
# when $inline is false.
193 &aesni_generate1("enc") if (!$inline);
194 &function_begin_B("${PREFIX}_encrypt");
195 &mov ("eax",&wparam(0));
196 &mov ($key,&wparam(2));
197 &movups ($inout0,&QWP(0,"eax"));
198 &mov ($rounds,&DWP(240,$key));
199 &mov ("eax",&wparam(1));
# NOTE(review): the condition choosing between the inline expansion and
# the call below is not visible in this excerpt.
201 { &aesni_inline_generate1("enc"); }
203 { &call ("_aesni_encrypt1"); }
204 &pxor ($rndkey0,$rndkey0); # clear register bank
205 &pxor ($rndkey1,$rndkey1);
206 &movups (&QWP(0,"eax"),$inout0);
# Wipe the data register once the result has been stored.
207 &pxor ($inout0,$inout0);
209 &function_end_B("${PREFIX}_encrypt");
211 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
# Single-block entry point, mirror image of ${PREFIX}_encrypt. Loads one
# block from the address in argument 0, takes the key pointer from
# argument 2 and the round count from offset 240 of the key structure,
# and stores the result to the address in argument 1. The out-of-line
# helper _aesni_decrypt1 is generated only when $inline is false.
212 &aesni_generate1("dec") if(!$inline);
213 &function_begin_B("${PREFIX}_decrypt");
214 &mov ("eax",&wparam(0));
215 &mov ($key,&wparam(2));
216 &movups ($inout0,&QWP(0,"eax"));
217 &mov ($rounds,&DWP(240,$key));
218 &mov ("eax",&wparam(1));
# NOTE(review): the condition choosing between the inline expansion and
# the call below is not visible in this excerpt.
220 { &aesni_inline_generate1("dec"); }
222 { &call ("_aesni_decrypt1"); }
223 &pxor ($rndkey0,$rndkey0); # clear register bank
224 &pxor ($rndkey1,$rndkey1);
225 &movups (&QWP(0,"eax"),$inout0);
# Wipe the data register once the result has been stored.
226 &pxor ($inout0,$inout0);
228 &function_end_B("${PREFIX}_decrypt");
230 # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
231 # factor. Why were 3x subroutines originally used in loops? Even though
232 # aes[enc|dec] latency was originally 6, it could be scheduled only
233 # every *2nd* cycle. Thus 3x interleave was the one providing optimal
234 # utilization, i.e. when the subroutine's throughput is virtually the
235 # same as that of a non-interleaved subroutine [for up to 3 input blocks].
236 # This is why it originally made no sense to implement a 2x subroutine.
237 # But times change and it became appropriate to spend an extra 192 bytes
238 # on a 2x subroutine on account of Atom Silvermont. For processors that
239 # can schedule aes[enc|dec] every cycle, the optimal interleave factor
240 # equals the corresponding instruction's latency. 8x is optimal for
241 # * Bridge, but it's infeasible to accommodate such an implementation
242 # in the XMM registers addressable in 32-bit mode and therefore a maximum
243 # of 6x is used instead...
248 &function_begin_B("_aesni_${p}rypt2");
249 &$movekey ($rndkey0,&QWP(0,$key));
251 &$movekey ($rndkey1,&QWP(16,$key));
252 &xorps ($inout0,$rndkey0);
253 &pxor ($inout1,$rndkey0);
254 &$movekey ($rndkey0,&QWP(32,$key));
255 &lea ($key,&DWP(32,$key,$rounds));
259 &set_label("${p}2_loop");
260 eval"&aes${p} ($inout0,$rndkey1)";
261 eval"&aes${p} ($inout1,$rndkey1)";
262 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
264 eval"&aes${p} ($inout0,$rndkey0)";
265 eval"&aes${p} ($inout1,$rndkey0)";
266 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
267 &jnz (&label("${p}2_loop"));
268 eval"&aes${p} ($inout0,$rndkey1)";
269 eval"&aes${p} ($inout1,$rndkey1)";
270 eval"&aes${p}last ($inout0,$rndkey0)";
271 eval"&aes${p}last ($inout1,$rndkey0)";
273 &function_end_B("_aesni_${p}rypt2");
279 &function_begin_B("_aesni_${p}rypt3");
280 &$movekey ($rndkey0,&QWP(0,$key));
282 &$movekey ($rndkey1,&QWP(16,$key));
283 &xorps ($inout0,$rndkey0);
284 &pxor ($inout1,$rndkey0);
285 &pxor ($inout2,$rndkey0);
286 &$movekey ($rndkey0,&QWP(32,$key));
287 &lea ($key,&DWP(32,$key,$rounds));
291 &set_label("${p}3_loop");
292 eval"&aes${p} ($inout0,$rndkey1)";
293 eval"&aes${p} ($inout1,$rndkey1)";
294 eval"&aes${p} ($inout2,$rndkey1)";
295 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
297 eval"&aes${p} ($inout0,$rndkey0)";
298 eval"&aes${p} ($inout1,$rndkey0)";
299 eval"&aes${p} ($inout2,$rndkey0)";
300 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
301 &jnz (&label("${p}3_loop"));
302 eval"&aes${p} ($inout0,$rndkey1)";
303 eval"&aes${p} ($inout1,$rndkey1)";
304 eval"&aes${p} ($inout2,$rndkey1)";
305 eval"&aes${p}last ($inout0,$rndkey0)";
306 eval"&aes${p}last ($inout1,$rndkey0)";
307 eval"&aes${p}last ($inout2,$rndkey0)";
309 &function_end_B("_aesni_${p}rypt3");
312 # 4x interleave is implemented to improve small block performance,
313 # most notably [and naturally] 4 block by ~30%. One can argue that one
314 # should have implemented 5x as well, but improvement would be <20%,
315 # so it's not worth it...
319 &function_begin_B("_aesni_${p}rypt4");
320 &$movekey ($rndkey0,&QWP(0,$key));
321 &$movekey ($rndkey1,&QWP(16,$key));
323 &xorps ($inout0,$rndkey0);
324 &pxor ($inout1,$rndkey0);
325 &pxor ($inout2,$rndkey0);
326 &pxor ($inout3,$rndkey0);
327 &$movekey ($rndkey0,&QWP(32,$key));
328 &lea ($key,&DWP(32,$key,$rounds));
330 &data_byte (0x0f,0x1f,0x40,0x00);
333 &set_label("${p}4_loop");
334 eval"&aes${p} ($inout0,$rndkey1)";
335 eval"&aes${p} ($inout1,$rndkey1)";
336 eval"&aes${p} ($inout2,$rndkey1)";
337 eval"&aes${p} ($inout3,$rndkey1)";
338 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
340 eval"&aes${p} ($inout0,$rndkey0)";
341 eval"&aes${p} ($inout1,$rndkey0)";
342 eval"&aes${p} ($inout2,$rndkey0)";
343 eval"&aes${p} ($inout3,$rndkey0)";
344 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
345 &jnz (&label("${p}4_loop"));
347 eval"&aes${p} ($inout0,$rndkey1)";
348 eval"&aes${p} ($inout1,$rndkey1)";
349 eval"&aes${p} ($inout2,$rndkey1)";
350 eval"&aes${p} ($inout3,$rndkey1)";
351 eval"&aes${p}last ($inout0,$rndkey0)";
352 eval"&aes${p}last ($inout1,$rndkey0)";
353 eval"&aes${p}last ($inout2,$rndkey0)";
354 eval"&aes${p}last ($inout3,$rndkey0)";
356 &function_end_B("_aesni_${p}rypt4");
362 &function_begin_B("_aesni_${p}rypt6");
363 &static_label("_aesni_${p}rypt6_enter");
364 &$movekey ($rndkey0,&QWP(0,$key));
366 &$movekey ($rndkey1,&QWP(16,$key));
367 &xorps ($inout0,$rndkey0);
368 &pxor ($inout1,$rndkey0); # pxor does better here
369 &pxor ($inout2,$rndkey0);
370 eval"&aes${p} ($inout0,$rndkey1)";
371 &pxor ($inout3,$rndkey0);
372 &pxor ($inout4,$rndkey0);
373 eval"&aes${p} ($inout1,$rndkey1)";
374 &lea ($key,&DWP(32,$key,$rounds));
376 eval"&aes${p} ($inout2,$rndkey1)";
377 &pxor ($inout5,$rndkey0);
378 &$movekey ($rndkey0,&QWP(0,$key,$rounds));
380 &jmp (&label("_aesni_${p}rypt6_inner"));
382 &set_label("${p}6_loop",16);
383 eval"&aes${p} ($inout0,$rndkey1)";
384 eval"&aes${p} ($inout1,$rndkey1)";
385 eval"&aes${p} ($inout2,$rndkey1)";
386 &set_label("_aesni_${p}rypt6_inner");
387 eval"&aes${p} ($inout3,$rndkey1)";
388 eval"&aes${p} ($inout4,$rndkey1)";
389 eval"&aes${p} ($inout5,$rndkey1)";
390 &set_label("_aesni_${p}rypt6_enter");
391 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
393 eval"&aes${p} ($inout0,$rndkey0)";
394 eval"&aes${p} ($inout1,$rndkey0)";
395 eval"&aes${p} ($inout2,$rndkey0)";
396 eval"&aes${p} ($inout3,$rndkey0)";
397 eval"&aes${p} ($inout4,$rndkey0)";
398 eval"&aes${p} ($inout5,$rndkey0)";
399 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
400 &jnz (&label("${p}6_loop"));
402 eval"&aes${p} ($inout0,$rndkey1)";
403 eval"&aes${p} ($inout1,$rndkey1)";
404 eval"&aes${p} ($inout2,$rndkey1)";
405 eval"&aes${p} ($inout3,$rndkey1)";
406 eval"&aes${p} ($inout4,$rndkey1)";
407 eval"&aes${p} ($inout5,$rndkey1)";
408 eval"&aes${p}last ($inout0,$rndkey0)";
409 eval"&aes${p}last ($inout1,$rndkey0)";
410 eval"&aes${p}last ($inout2,$rndkey0)";
411 eval"&aes${p}last ($inout3,$rndkey0)";
412 eval"&aes${p}last ($inout4,$rndkey0)";
413 eval"&aes${p}last ($inout5,$rndkey0)";
415 &function_end_B("_aesni_${p}rypt6");
# Instantiate the interleaved private routines. The "dec" variants are
# always generated; the "enc" variants only when $PREFIX is "aesni".
417 &aesni_generate2("enc") if ($PREFIX eq "aesni");
418 &aesni_generate2("dec");
419 &aesni_generate3("enc") if ($PREFIX eq "aesni");
420 &aesni_generate3("dec");
421 &aesni_generate4("enc") if ($PREFIX eq "aesni");
422 &aesni_generate4("dec");
423 &aesni_generate6("enc") if ($PREFIX eq "aesni");
424 &aesni_generate6("dec");
426 if ($PREFIX eq "aesni") {
427 ######################################################################
428 # void aesni_ecb_encrypt (const void *in, void *out,
429 # size_t length, const AES_KEY *key,
431 &function_begin("aesni_ecb_encrypt");
432 &mov ($inp,&wparam(0));
433 &mov ($out,&wparam(1));
434 &mov ($len,&wparam(2));
435 &mov ($key,&wparam(3));
436 &mov ($rounds_,&wparam(4));
438 &jz (&label("ecb_ret"));
439 &mov ($rounds,&DWP(240,$key));
440 &test ($rounds_,$rounds_);
441 &jz (&label("ecb_decrypt"));
443 &mov ($key_,$key); # backup $key
444 &mov ($rounds_,$rounds); # backup $rounds
446 &jb (&label("ecb_enc_tail"));
448 &movdqu ($inout0,&QWP(0,$inp));
449 &movdqu ($inout1,&QWP(0x10,$inp));
450 &movdqu ($inout2,&QWP(0x20,$inp));
451 &movdqu ($inout3,&QWP(0x30,$inp));
452 &movdqu ($inout4,&QWP(0x40,$inp));
453 &movdqu ($inout5,&QWP(0x50,$inp));
454 &lea ($inp,&DWP(0x60,$inp));
456 &jmp (&label("ecb_enc_loop6_enter"));
458 &set_label("ecb_enc_loop6",16);
459 &movups (&QWP(0,$out),$inout0);
460 &movdqu ($inout0,&QWP(0,$inp));
461 &movups (&QWP(0x10,$out),$inout1);
462 &movdqu ($inout1,&QWP(0x10,$inp));
463 &movups (&QWP(0x20,$out),$inout2);
464 &movdqu ($inout2,&QWP(0x20,$inp));
465 &movups (&QWP(0x30,$out),$inout3);
466 &movdqu ($inout3,&QWP(0x30,$inp));
467 &movups (&QWP(0x40,$out),$inout4);
468 &movdqu ($inout4,&QWP(0x40,$inp));
469 &movups (&QWP(0x50,$out),$inout5);
470 &lea ($out,&DWP(0x60,$out));
471 &movdqu ($inout5,&QWP(0x50,$inp));
472 &lea ($inp,&DWP(0x60,$inp));
473 &set_label("ecb_enc_loop6_enter");
475 &call ("_aesni_encrypt6");
477 &mov ($key,$key_); # restore $key
478 &mov ($rounds,$rounds_); # restore $rounds
480 &jnc (&label("ecb_enc_loop6"));
482 &movups (&QWP(0,$out),$inout0);
483 &movups (&QWP(0x10,$out),$inout1);
484 &movups (&QWP(0x20,$out),$inout2);
485 &movups (&QWP(0x30,$out),$inout3);
486 &movups (&QWP(0x40,$out),$inout4);
487 &movups (&QWP(0x50,$out),$inout5);
488 &lea ($out,&DWP(0x60,$out));
490 &jz (&label("ecb_ret"));
# ecb_enc_tail: handle the blocks that do not fill a 6x batch.
# Input blocks are loaded one at a time; after each load a conditional
# jump may divert to the dedicated one/two/three/four-block handler.
# NOTE(review): the compares that set the flags for jb/je are not visible
# in this excerpt - confirm against the full file.
492 &set_label("ecb_enc_tail");
493 &movups ($inout0,&QWP(0,$inp));
495 &jb (&label("ecb_enc_one"));
496 &movups ($inout1,&QWP(0x10,$inp));
497 &je (&label("ecb_enc_two"));
498 &movups ($inout2,&QWP(0x20,$inp));
500 &jb (&label("ecb_enc_three"));
501 &movups ($inout3,&QWP(0x30,$inp));
502 &je (&label("ecb_enc_four"));
503 &movups ($inout4,&QWP(0x40,$inp));
# No branch taken: five blocks are loaded. Zero the sixth lane so the 6x
# routine can be reused; only the first five results are stored.
504 &xorps ($inout5,$inout5);
505 &call ("_aesni_encrypt6");
506 &movups (&QWP(0,$out),$inout0);
507 &movups (&QWP(0x10,$out),$inout1);
508 &movups (&QWP(0x20,$out),$inout2);
509 &movups (&QWP(0x30,$out),$inout3);
510 &movups (&QWP(0x40,$out),$inout4);
# Emitted with the & sigil like every other instruction in this script
# (the matching decrypt path already uses &jmp).
511 &jmp (&label("ecb_ret"));
513 &set_label("ecb_enc_one",16);
515 { &aesni_inline_generate1("enc"); }
517 { &call ("_aesni_encrypt1"); }
518 &movups (&QWP(0,$out),$inout0);
519 &jmp (&label("ecb_ret"));
521 &set_label("ecb_enc_two",16);
522 &call ("_aesni_encrypt2");
523 &movups (&QWP(0,$out),$inout0);
524 &movups (&QWP(0x10,$out),$inout1);
525 &jmp (&label("ecb_ret"));
527 &set_label("ecb_enc_three",16);
528 &call ("_aesni_encrypt3");
529 &movups (&QWP(0,$out),$inout0);
530 &movups (&QWP(0x10,$out),$inout1);
531 &movups (&QWP(0x20,$out),$inout2);
532 &jmp (&label("ecb_ret"));
534 &set_label("ecb_enc_four",16);
535 &call ("_aesni_encrypt4");
536 &movups (&QWP(0,$out),$inout0);
537 &movups (&QWP(0x10,$out),$inout1);
538 &movups (&QWP(0x20,$out),$inout2);
539 &movups (&QWP(0x30,$out),$inout3);
540 &jmp (&label("ecb_ret"));
541 ######################################################################
542 &set_label("ecb_decrypt",16);
543 &mov ($key_,$key); # backup $key
544 &mov ($rounds_,$rounds); # backup $rounds
546 &jb (&label("ecb_dec_tail"));
548 &movdqu ($inout0,&QWP(0,$inp));
549 &movdqu ($inout1,&QWP(0x10,$inp));
550 &movdqu ($inout2,&QWP(0x20,$inp));
551 &movdqu ($inout3,&QWP(0x30,$inp));
552 &movdqu ($inout4,&QWP(0x40,$inp));
553 &movdqu ($inout5,&QWP(0x50,$inp));
554 &lea ($inp,&DWP(0x60,$inp));
556 &jmp (&label("ecb_dec_loop6_enter"));
558 &set_label("ecb_dec_loop6",16);
559 &movups (&QWP(0,$out),$inout0);
560 &movdqu ($inout0,&QWP(0,$inp));
561 &movups (&QWP(0x10,$out),$inout1);
562 &movdqu ($inout1,&QWP(0x10,$inp));
563 &movups (&QWP(0x20,$out),$inout2);
564 &movdqu ($inout2,&QWP(0x20,$inp));
565 &movups (&QWP(0x30,$out),$inout3);
566 &movdqu ($inout3,&QWP(0x30,$inp));
567 &movups (&QWP(0x40,$out),$inout4);
568 &movdqu ($inout4,&QWP(0x40,$inp));
569 &movups (&QWP(0x50,$out),$inout5);
570 &lea ($out,&DWP(0x60,$out));
571 &movdqu ($inout5,&QWP(0x50,$inp));
572 &lea ($inp,&DWP(0x60,$inp));
573 &set_label("ecb_dec_loop6_enter");
575 &call ("_aesni_decrypt6");
577 &mov ($key,$key_); # restore $key
578 &mov ($rounds,$rounds_); # restore $rounds
580 &jnc (&label("ecb_dec_loop6"));
582 &movups (&QWP(0,$out),$inout0);
583 &movups (&QWP(0x10,$out),$inout1);
584 &movups (&QWP(0x20,$out),$inout2);
585 &movups (&QWP(0x30,$out),$inout3);
586 &movups (&QWP(0x40,$out),$inout4);
587 &movups (&QWP(0x50,$out),$inout5);
588 &lea ($out,&DWP(0x60,$out));
590 &jz (&label("ecb_ret"));
592 &set_label("ecb_dec_tail");
593 &movups ($inout0,&QWP(0,$inp));
595 &jb (&label("ecb_dec_one"));
596 &movups ($inout1,&QWP(0x10,$inp));
597 &je (&label("ecb_dec_two"));
598 &movups ($inout2,&QWP(0x20,$inp));
600 &jb (&label("ecb_dec_three"));
601 &movups ($inout3,&QWP(0x30,$inp));
602 &je (&label("ecb_dec_four"));
603 &movups ($inout4,&QWP(0x40,$inp));
604 &xorps ($inout5,$inout5);
605 &call ("_aesni_decrypt6");
606 &movups (&QWP(0,$out),$inout0);
607 &movups (&QWP(0x10,$out),$inout1);
608 &movups (&QWP(0x20,$out),$inout2);
609 &movups (&QWP(0x30,$out),$inout3);
610 &movups (&QWP(0x40,$out),$inout4);
611 &jmp (&label("ecb_ret"));
613 &set_label("ecb_dec_one",16);
615 { &aesni_inline_generate1("dec"); }
617 { &call ("_aesni_decrypt1"); }
618 &movups (&QWP(0,$out),$inout0);
619 &jmp (&label("ecb_ret"));
621 &set_label("ecb_dec_two",16);
622 &call ("_aesni_decrypt2");
623 &movups (&QWP(0,$out),$inout0);
624 &movups (&QWP(0x10,$out),$inout1);
625 &jmp (&label("ecb_ret"));
627 &set_label("ecb_dec_three",16);
628 &call ("_aesni_decrypt3");
629 &movups (&QWP(0,$out),$inout0);
630 &movups (&QWP(0x10,$out),$inout1);
631 &movups (&QWP(0x20,$out),$inout2);
632 &jmp (&label("ecb_ret"));
634 &set_label("ecb_dec_four",16);
635 &call ("_aesni_decrypt4");
636 &movups (&QWP(0,$out),$inout0);
637 &movups (&QWP(0x10,$out),$inout1);
638 &movups (&QWP(0x20,$out),$inout2);
639 &movups (&QWP(0x30,$out),$inout3);
# ecb_ret: shared exit label targeted by the ECB paths above. Zeroes
# xmm0 through xmm7 and then closes aesni_ecb_encrypt.
641 &set_label("ecb_ret");
642 &pxor ("xmm0","xmm0"); # clear register bank
643 &pxor ("xmm1","xmm1");
644 &pxor ("xmm2","xmm2");
645 &pxor ("xmm3","xmm3");
646 &pxor ("xmm4","xmm4");
647 &pxor ("xmm5","xmm5");
648 &pxor ("xmm6","xmm6");
649 &pxor ("xmm7","xmm7");
650 &function_end("aesni_ecb_encrypt");
652 ######################################################################
653 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
654 # size_t blocks, const AES_KEY *key,
655 # const char *ivec,char *cmac);
657 # Handles only complete blocks, operates on 64-bit counter and
658 # does not update *ivec! Nor does it finalize CMAC value
659 # (see engine/eng_aesni.c for details)
662 &function_begin("aesni_ccm64_encrypt_blocks");
663 &mov ($inp,&wparam(0));
664 &mov ($out,&wparam(1));
665 &mov ($len,&wparam(2));
666 &mov ($key,&wparam(3));
667 &mov ($rounds_,&wparam(4));
668 &mov ($rounds,&wparam(5));
671 &and ("esp",-16); # align stack
672 &mov (&DWP(48,"esp"),$key_);
674 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
675 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
676 &mov ($rounds,&DWP(240,$key));
678 # compose byte-swap control mask for pshufb on stack
679 &mov (&DWP(0,"esp"),0x0c0d0e0f);
680 &mov (&DWP(4,"esp"),0x08090a0b);
681 &mov (&DWP(8,"esp"),0x04050607);
682 &mov (&DWP(12,"esp"),0x00010203);
684 # compose counter increment vector on stack
687 &mov (&DWP(16,"esp"),$rounds_);
688 &mov (&DWP(20,"esp"),$key_);
689 &mov (&DWP(24,"esp"),$key_);
690 &mov (&DWP(28,"esp"),$key_);
694 &lea ($key_,&DWP(0,$key));
695 &movdqa ($inout3,&QWP(0,"esp"));
696 &movdqa ($inout0,$ivec);
697 &lea ($key,&DWP(32,$key,$rounds));
698 &sub ($rounds_,$rounds);
699 &pshufb ($ivec,$inout3);
701 &set_label("ccm64_enc_outer");
702 &$movekey ($rndkey0,&QWP(0,$key_));
703 &mov ($rounds,$rounds_);
704 &movups ($in0,&QWP(0,$inp));
706 &xorps ($inout0,$rndkey0);
707 &$movekey ($rndkey1,&QWP(16,$key_));
708 &xorps ($rndkey0,$in0);
709 &xorps ($cmac,$rndkey0); # cmac^=inp
710 &$movekey ($rndkey0,&QWP(32,$key_));
712 &set_label("ccm64_enc2_loop");
713 &aesenc ($inout0,$rndkey1);
714 &aesenc ($cmac,$rndkey1);
715 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
717 &aesenc ($inout0,$rndkey0);
718 &aesenc ($cmac,$rndkey0);
719 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
720 &jnz (&label("ccm64_enc2_loop"));
721 &aesenc ($inout0,$rndkey1);
722 &aesenc ($cmac,$rndkey1);
723 &paddq ($ivec,&QWP(16,"esp"));
725 &aesenclast ($inout0,$rndkey0);
726 &aesenclast ($cmac,$rndkey0);
728 &lea ($inp,&DWP(16,$inp));
729 &xorps ($in0,$inout0); # inp^=E(ivec)
730 &movdqa ($inout0,$ivec);
731 &movups (&QWP(0,$out),$in0); # save output
732 &pshufb ($inout0,$inout3);
733 &lea ($out,&DWP(16,$out));
734 &jnz (&label("ccm64_enc_outer"));
736 &mov ("esp",&DWP(48,"esp"));
737 &mov ($out,&wparam(5));
738 &movups (&QWP(0,$out),$cmac);
740 &pxor ("xmm0","xmm0"); # clear register bank
741 &pxor ("xmm1","xmm1");
742 &pxor ("xmm2","xmm2");
743 &pxor ("xmm3","xmm3");
744 &pxor ("xmm4","xmm4");
745 &pxor ("xmm5","xmm5");
746 &pxor ("xmm6","xmm6");
747 &pxor ("xmm7","xmm7");
748 &function_end("aesni_ccm64_encrypt_blocks");
750 &function_begin("aesni_ccm64_decrypt_blocks");
751 &mov ($inp,&wparam(0));
752 &mov ($out,&wparam(1));
753 &mov ($len,&wparam(2));
754 &mov ($key,&wparam(3));
755 &mov ($rounds_,&wparam(4));
756 &mov ($rounds,&wparam(5));
759 &and ("esp",-16); # align stack
760 &mov (&DWP(48,"esp"),$key_);
762 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
763 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
764 &mov ($rounds,&DWP(240,$key));
766 # compose byte-swap control mask for pshufb on stack
767 &mov (&DWP(0,"esp"),0x0c0d0e0f);
768 &mov (&DWP(4,"esp"),0x08090a0b);
769 &mov (&DWP(8,"esp"),0x04050607);
770 &mov (&DWP(12,"esp"),0x00010203);
772 # compose counter increment vector on stack
775 &mov (&DWP(16,"esp"),$rounds_);
776 &mov (&DWP(20,"esp"),$key_);
777 &mov (&DWP(24,"esp"),$key_);
778 &mov (&DWP(28,"esp"),$key_);
780 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
781 &movdqa ($inout0,$ivec);
784 &mov ($rounds_,$rounds);
786 &pshufb ($ivec,$inout3);
788 { &aesni_inline_generate1("enc"); }
790 { &call ("_aesni_encrypt1"); }
793 &movups ($in0,&QWP(0,$inp)); # load inp
794 &paddq ($ivec,&QWP(16,"esp"));
795 &lea ($inp,&QWP(16,$inp));
796 &sub ($rounds,$rounds_);
797 &lea ($key,&DWP(32,$key_,$rounds_));
798 &mov ($rounds_,$rounds);
799 &jmp (&label("ccm64_dec_outer"));
801 &set_label("ccm64_dec_outer",16);
802 &xorps ($in0,$inout0); # inp ^= E(ivec)
803 &movdqa ($inout0,$ivec);
804 &movups (&QWP(0,$out),$in0); # save output
805 &lea ($out,&DWP(16,$out));
806 &pshufb ($inout0,$inout3);
809 &jz (&label("ccm64_dec_break"));
811 &$movekey ($rndkey0,&QWP(0,$key_));
812 &mov ($rounds,$rounds_);
813 &$movekey ($rndkey1,&QWP(16,$key_));
814 &xorps ($in0,$rndkey0);
815 &xorps ($inout0,$rndkey0);
816 &xorps ($cmac,$in0); # cmac^=out
817 &$movekey ($rndkey0,&QWP(32,$key_));
819 &set_label("ccm64_dec2_loop");
820 &aesenc ($inout0,$rndkey1);
821 &aesenc ($cmac,$rndkey1);
822 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
824 &aesenc ($inout0,$rndkey0);
825 &aesenc ($cmac,$rndkey0);
826 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
827 &jnz (&label("ccm64_dec2_loop"));
828 &movups ($in0,&QWP(0,$inp)); # load inp
829 &paddq ($ivec,&QWP(16,"esp"));
830 &aesenc ($inout0,$rndkey1);
831 &aesenc ($cmac,$rndkey1);
832 &aesenclast ($inout0,$rndkey0);
833 &aesenclast ($cmac,$rndkey0);
834 &lea ($inp,&QWP(16,$inp));
835 &jmp (&label("ccm64_dec_outer"));
837 &set_label("ccm64_dec_break",16);
838 &mov ($rounds,&DWP(240,$key_));
841 { &aesni_inline_generate1("enc",$cmac,$in0); }
843 { &call ("_aesni_encrypt1",$cmac); }
845 &mov ("esp",&DWP(48,"esp"));
846 &mov ($out,&wparam(5));
847 &movups (&QWP(0,$out),$cmac);
849 &pxor ("xmm0","xmm0"); # clear register bank
850 &pxor ("xmm1","xmm1");
851 &pxor ("xmm2","xmm2");
852 &pxor ("xmm3","xmm3");
853 &pxor ("xmm4","xmm4");
854 &pxor ("xmm5","xmm5");
855 &pxor ("xmm6","xmm6");
856 &pxor ("xmm7","xmm7");
857 &function_end("aesni_ccm64_decrypt_blocks");
860 ######################################################################
861 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
862 # size_t blocks, const AES_KEY *key,
865 # Handles only complete blocks, operates on 32-bit counter and
866 # does not update *ivec! (see crypto/modes/ctr128.c for details)
870 # 16 vector addend: 0,6,6,6
871 # 32 counter-less ivec
872 # 48 1st triplet of counter vector
873 # 64 2nd triplet of counter vector
876 &function_begin("aesni_ctr32_encrypt_blocks");
877 &mov ($inp,&wparam(0));
878 &mov ($out,&wparam(1));
879 &mov ($len,&wparam(2));
880 &mov ($key,&wparam(3));
881 &mov ($rounds_,&wparam(4));
884 &and ("esp",-16); # align stack
885 &mov (&DWP(80,"esp"),$key_);
888 &je (&label("ctr32_one_shortcut"));
890 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
892 # compose byte-swap control mask for pshufb on stack
893 &mov (&DWP(0,"esp"),0x0c0d0e0f);
894 &mov (&DWP(4,"esp"),0x08090a0b);
895 &mov (&DWP(8,"esp"),0x04050607);
896 &mov (&DWP(12,"esp"),0x00010203);
898 # compose counter increment vector on stack
901 &mov (&DWP(16,"esp"),$rounds);
902 &mov (&DWP(20,"esp"),$rounds);
903 &mov (&DWP(24,"esp"),$rounds);
904 &mov (&DWP(28,"esp"),$key_);
906 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
907 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
909 &mov ($rounds,&DWP(240,$key)); # key->rounds
911 # compose 2 vectors of 3x32-bit counters
913 &pxor ($rndkey0,$rndkey0);
914 &pxor ($rndkey1,$rndkey1);
915 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
916 &pinsrd ($rndkey0,$rounds_,0);
917 &lea ($key_,&DWP(3,$rounds_));
918 &pinsrd ($rndkey1,$key_,0);
920 &pinsrd ($rndkey0,$rounds_,1);
922 &pinsrd ($rndkey1,$key_,1);
924 &pinsrd ($rndkey0,$rounds_,2);
926 &pinsrd ($rndkey1,$key_,2);
927 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
928 &pshufb ($rndkey0,$inout0); # byte swap
929 &movdqu ($inout4,&QWP(0,$key)); # key[0]
930 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
931 &pshufb ($rndkey1,$inout0); # byte swap
933 &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword
934 &pshufd ($inout1,$rndkey0,2<<6);
936 &jb (&label("ctr32_tail"));
937 &pxor ($inout5,$inout4); # counter-less ivec^key[0]
940 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0]
941 &mov ($key_,$key); # backup $key
942 &sub ($rounds_,$rounds); # backup twisted $rounds
943 &lea ($key,&DWP(32,$key,$rounds));
945 &jmp (&label("ctr32_loop6"));
947 &set_label("ctr32_loop6",16);
948 # inlining _aesni_encrypt6's prologue gives ~6% improvement...
949 &pshufd ($inout2,$rndkey0,1<<6);
950 &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec
951 &pshufd ($inout3,$rndkey1,3<<6);
952 &pxor ($inout0,$rndkey0); # merge counter-less ivec
953 &pshufd ($inout4,$rndkey1,2<<6);
954 &pxor ($inout1,$rndkey0);
955 &pshufd ($inout5,$rndkey1,1<<6);
956 &$movekey ($rndkey1,&QWP(16,$key_));
957 &pxor ($inout2,$rndkey0);
958 &pxor ($inout3,$rndkey0);
959 &aesenc ($inout0,$rndkey1);
960 &pxor ($inout4,$rndkey0);
961 &pxor ($inout5,$rndkey0);
962 &aesenc ($inout1,$rndkey1);
963 &$movekey ($rndkey0,&QWP(32,$key_));
964 &mov ($rounds,$rounds_);
965 &aesenc ($inout2,$rndkey1);
966 &aesenc ($inout3,$rndkey1);
967 &aesenc ($inout4,$rndkey1);
968 &aesenc ($inout5,$rndkey1);
970 &call (&label("_aesni_encrypt6_enter"));
972 &movups ($rndkey1,&QWP(0,$inp));
973 &movups ($rndkey0,&QWP(0x10,$inp));
974 &xorps ($inout0,$rndkey1);
975 &movups ($rndkey1,&QWP(0x20,$inp));
976 &xorps ($inout1,$rndkey0);
977 &movups (&QWP(0,$out),$inout0);
978 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
979 &xorps ($inout2,$rndkey1);
980 &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet
981 &movups (&QWP(0x10,$out),$inout1);
982 &movups (&QWP(0x20,$out),$inout2);
984 &paddd ($rndkey1,$rndkey0); # 2nd triplet increment
985 &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment
986 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
988 &movups ($inout1,&QWP(0x30,$inp));
989 &movups ($inout2,&QWP(0x40,$inp));
990 &xorps ($inout3,$inout1);
991 &movups ($inout1,&QWP(0x50,$inp));
992 &lea ($inp,&DWP(0x60,$inp));
993 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
994 &pshufb ($rndkey0,$inout0); # byte swap
995 &xorps ($inout4,$inout2);
996 &movups (&QWP(0x30,$out),$inout3);
997 &xorps ($inout5,$inout1);
998 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
999 &pshufb ($rndkey1,$inout0); # byte swap
1000 &movups (&QWP(0x40,$out),$inout4);
1001 &pshufd ($inout0,$rndkey0,3<<6);
1002 &movups (&QWP(0x50,$out),$inout5);
1003 &lea ($out,&DWP(0x60,$out));
1005 &pshufd ($inout1,$rndkey0,2<<6);
1007 &jnc (&label("ctr32_loop6"));
1010 &jz (&label("ctr32_ret"));
1011 &movdqu ($inout5,&QWP(0,$key_));
1013 &pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec
1014 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
1016 &set_label("ctr32_tail");
1017 &por ($inout0,$inout5);
1019 &jb (&label("ctr32_one"));
1021 &pshufd ($inout2,$rndkey0,1<<6);
1022 &por ($inout1,$inout5);
1023 &je (&label("ctr32_two"));
1025 &pshufd ($inout3,$rndkey1,3<<6);
1026 &por ($inout2,$inout5);
1028 &jb (&label("ctr32_three"));
1030 &pshufd ($inout4,$rndkey1,2<<6);
1031 &por ($inout3,$inout5);
1032 &je (&label("ctr32_four"));
1034 &por ($inout4,$inout5);
1035 &call ("_aesni_encrypt6");
1036 &movups ($rndkey1,&QWP(0,$inp));
1037 &movups ($rndkey0,&QWP(0x10,$inp));
1038 &xorps ($inout0,$rndkey1);
1039 &movups ($rndkey1,&QWP(0x20,$inp));
1040 &xorps ($inout1,$rndkey0);
1041 &movups ($rndkey0,&QWP(0x30,$inp));
1042 &xorps ($inout2,$rndkey1);
1043 &movups ($rndkey1,&QWP(0x40,$inp));
1044 &xorps ($inout3,$rndkey0);
1045 &movups (&QWP(0,$out),$inout0);
1046 &xorps ($inout4,$rndkey1);
1047 &movups (&QWP(0x10,$out),$inout1);
1048 &movups (&QWP(0x20,$out),$inout2);
1049 &movups (&QWP(0x30,$out),$inout3);
1050 &movups (&QWP(0x40,$out),$inout4);
1051 &jmp (&label("ctr32_ret"));
1053 &set_label("ctr32_one_shortcut",16);
1054 &movups ($inout0,&QWP(0,$rounds_)); # load ivec
1055 &mov ($rounds,&DWP(240,$key));
1057 &set_label("ctr32_one");
1059 { &aesni_inline_generate1("enc"); }
1061 { &call ("_aesni_encrypt1"); }
1062 &movups ($in0,&QWP(0,$inp));
1063 &xorps ($in0,$inout0);
1064 &movups (&QWP(0,$out),$in0);
1065 &jmp (&label("ctr32_ret"));
1067 &set_label("ctr32_two",16);
1068 &call ("_aesni_encrypt2");
1069 &movups ($inout3,&QWP(0,$inp));
1070 &movups ($inout4,&QWP(0x10,$inp));
1071 &xorps ($inout0,$inout3);
1072 &xorps ($inout1,$inout4);
1073 &movups (&QWP(0,$out),$inout0);
1074 &movups (&QWP(0x10,$out),$inout1);
1075 &jmp (&label("ctr32_ret"));
1077 &set_label("ctr32_three",16);
1078 &call ("_aesni_encrypt3");
1079 &movups ($inout3,&QWP(0,$inp));
1080 &movups ($inout4,&QWP(0x10,$inp));
1081 &xorps ($inout0,$inout3);
1082 &movups ($inout5,&QWP(0x20,$inp));
1083 &xorps ($inout1,$inout4);
1084 &movups (&QWP(0,$out),$inout0);
1085 &xorps ($inout2,$inout5);
1086 &movups (&QWP(0x10,$out),$inout1);
1087 &movups (&QWP(0x20,$out),$inout2);
1088 &jmp (&label("ctr32_ret"));
1090 &set_label("ctr32_four",16);
1091 &call ("_aesni_encrypt4");
1092 &movups ($inout4,&QWP(0,$inp));
1093 &movups ($inout5,&QWP(0x10,$inp));
1094 &movups ($rndkey1,&QWP(0x20,$inp));
1095 &xorps ($inout0,$inout4);
1096 &movups ($rndkey0,&QWP(0x30,$inp));
1097 &xorps ($inout1,$inout5);
1098 &movups (&QWP(0,$out),$inout0);
1099 &xorps ($inout2,$rndkey1);
1100 &movups (&QWP(0x10,$out),$inout1);
1101 &xorps ($inout3,$rndkey0);
1102 &movups (&QWP(0x20,$out),$inout2);
1103 &movups (&QWP(0x30,$out),$inout3);
1105 &set_label("ctr32_ret");
1106 &pxor ("xmm0","xmm0"); # clear register bank
1107 &pxor ("xmm1","xmm1");
1108 &pxor ("xmm2","xmm2");
1109 &pxor ("xmm3","xmm3");
1110 &pxor ("xmm4","xmm4");
1111 &movdqa (&QWP(32,"esp"),"xmm0"); # clear stack
1112 &pxor ("xmm5","xmm5");
1113 &movdqa (&QWP(48,"esp"),"xmm0");
1114 &pxor ("xmm6","xmm6");
1115 &movdqa (&QWP(64,"esp"),"xmm0");
1116 &pxor ("xmm7","xmm7");
1117 &mov ("esp",&DWP(80,"esp"));
1118 &function_end("aesni_ctr32_encrypt_blocks");
1120 ######################################################################
1121 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1122 # const AES_KEY *key1, const AES_KEY *key2,
1123 # const unsigned char iv[16]);
# Register aliases used by the XTS code: $tweak is $rndkey1, $twtmp is
# $rndkey0, $twres is $inout0 and $twmask is $inout1, so these names
# share registers with the key and data registers used further down.
1125 { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
#
# aesni_xts_encrypt(inp, out, len, key1, key2, iv)
#
# Outline, as far as this listing shows:
#   1. Load key2 (argument 4) and the 16-byte clear-text tweak (argument 5)
#      and run the tweak through a single-block encryption.
#   2. Load inp/out/len/key1 and set up an aligned stack frame:
#        16*0 .. 16*5   per-block tweaks
#        16*6           the 0x87/0/1/0 constant used when updating the tweak
#        16*7+0         saved $len
#        16*7+4         saved $key_ (commented below as the original %esp)
#   3. xts_enc_loop6 processes six blocks per pass; xts_enc_short and the
#      xts_enc_one/two/three/four labels (plus a five-block fall-through)
#      process what is left.
#   4. xts_enc_steal swaps bytes between the input tail and the last
#      output block and then re-encrypts that block.
#   5. xts_enc_ret zeroes xmm0-xmm7 and frame slots 16*0..16*5 and
#      restores %esp from the frame.
#
# NOTE(review): several conditional jumps below (jc/jz/jb/je/jnc/jnz) have
# no flag-setting instruction in front of them in this listing; presumably
# the corresponding and/sub/cmp/test lines are simply not visible here --
# confirm against the complete file before relying on the branch comments.
1127 &function_begin("aesni_xts_encrypt");
# Step 1: encrypt the clear-text tweak with key2.
1128 &mov ($key,&wparam(4)); # key2
1129 &mov ($inp,&wparam(5)); # clear-text tweak
1131 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1132 &movups ($inout0,&QWP(0,$inp));
# NOTE(review): an inlined single-block encryption and a call to
# _aesni_encrypt1 appear back to back; presumably a conditional that picks
# one of the two is not visible in this listing -- confirm.
1134 { &aesni_inline_generate1("enc"); }
1136 { &call ("_aesni_encrypt1"); }
# Step 2: load the data arguments and key1, then build the stack frame.
1138 &mov ($inp,&wparam(0));
1139 &mov ($out,&wparam(1));
1140 &mov ($len,&wparam(2));
1141 &mov ($key,&wparam(3)); # key1
1144 &sub ("esp",16*7+8);
1145 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1146 &and ("esp",-16); # align stack
# The four dwords 0x87, 0, 1, 0 are written to frame slot 16*6 and later
# loaded into $twmask.
1148 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1149 &mov (&DWP(16*6+4,"esp"),0);
1150 &mov (&DWP(16*6+8,"esp"),1);
1151 &mov (&DWP(16*6+12,"esp"),0);
1152 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
# NOTE(review): $key_ presumably received a copy of %esp on a line that is
# not visible here -- confirm.
1153 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
# The encrypted tweak (result left in $inout0) becomes the running $tweak.
# Tweak update idiom used throughout this function: pcmpgtd against a
# zeroed $twtmp yields a per-dword mask of the sign bits of $tweak, pshufd
# 0x13 rearranges those masks, pand with $twmask keeps only the 0x87 and 1
# bit positions, paddq doubles both 64-bit halves and pxor folds the
# masked bits back in.
1155 &movdqa ($tweak,$inout0);
1156 &pxor ($twtmp,$twtmp);
1157 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1158 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1161 &mov ($key_,$key); # backup $key
1162 &mov ($rounds_,$rounds); # backup $rounds
1164 &jc (&label("xts_enc_short"));
1168 &sub ($rounds_,$rounds);
1169 &lea ($key,&DWP(32,$key,$rounds));
1170 &jmp (&label("xts_enc_loop6"));
# Six-block main loop.  The Perl for-loop emits the tweak update four
# times, storing tweaks into frame slots 16*0..16*3.
# NOTE(review): the loop's closing brace is not visible in this listing;
# presumably it follows the first pxor ($tweak,$twres) -- confirm.
1172 &set_label("xts_enc_loop6",16);
1173 for ($i=0;$i<4;$i++) {
1174 &pshufd ($twres,$twtmp,0x13);
1175 &pxor ($twtmp,$twtmp);
1176 &movdqa (&QWP(16*$i,"esp"),$tweak);
1177 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1178 &pand ($twres,$twmask); # isolate carry and residue
1179 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1180 &pxor ($tweak,$twres);
# Fifth tweak goes to slot 16*4 ($i is post-incremented to 5 here); the
# sixth tweak is built in $inout5 and saved to slot 16*5 further down.
1182 &pshufd ($inout5,$twtmp,0x13);
1183 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1184 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1185 &$movekey ($rndkey0,&QWP(0,$key_));
1186 &pand ($inout5,$twmask); # isolate carry and residue
1187 &movups ($inout0,&QWP(0,$inp)); # load input
1188 &pxor ($inout5,$tweak);
1190 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1191 &mov ($rounds,$rounds_); # restore $rounds
1192 &movdqu ($inout1,&QWP(16*1,$inp));
1193 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1194 &movdqu ($inout2,&QWP(16*2,$inp));
1195 &pxor ($inout1,$rndkey0);
1196 &movdqu ($inout3,&QWP(16*3,$inp));
1197 &pxor ($inout2,$rndkey0);
1198 &movdqu ($inout4,&QWP(16*4,$inp));
1199 &pxor ($inout3,$rndkey0);
# The sixth input block is loaded into $rndkey1 (free at this point) and
# xored into $inout5, which already holds the sixth tweak.
1200 &movdqu ($rndkey1,&QWP(16*5,$inp));
1201 &pxor ($inout4,$rndkey0);
1202 &lea ($inp,&DWP(16*6,$inp));
1203 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1204 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1205 &pxor ($inout5,$rndkey1);
# First AES round on all six lanes, interleaved with the remaining tweak
# xors, before entering the shared six-block routine at its _enter label.
1207 &$movekey ($rndkey1,&QWP(16,$key_));
1208 &pxor ($inout1,&QWP(16*1,"esp"));
1209 &pxor ($inout2,&QWP(16*2,"esp"));
1210 &aesenc ($inout0,$rndkey1);
1211 &pxor ($inout3,&QWP(16*3,"esp"));
1212 &pxor ($inout4,&QWP(16*4,"esp"));
1213 &aesenc ($inout1,$rndkey1);
1214 &pxor ($inout5,$rndkey0);
1215 &$movekey ($rndkey0,&QWP(32,$key_));
1216 &aesenc ($inout2,$rndkey1);
1217 &aesenc ($inout3,$rndkey1);
1218 &aesenc ($inout4,$rndkey1);
1219 &aesenc ($inout5,$rndkey1);
1220 &call (&label("_aesni_encrypt6_enter"));
# Post-process: xor each result with its saved tweak, store six blocks,
# and derive the next tweak from the sixth one reloaded from slot 16*5.
1222 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1223 &pxor ($twtmp,$twtmp);
1224 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1225 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1226 &xorps ($inout1,&QWP(16*1,"esp"));
1227 &movups (&QWP(16*0,$out),$inout0); # write output
1228 &xorps ($inout2,&QWP(16*2,"esp"));
1229 &movups (&QWP(16*1,$out),$inout1);
1230 &xorps ($inout3,&QWP(16*3,"esp"));
1231 &movups (&QWP(16*2,$out),$inout2);
1232 &xorps ($inout4,&QWP(16*4,"esp"));
1233 &movups (&QWP(16*3,$out),$inout3);
1234 &xorps ($inout5,$tweak);
1235 &movups (&QWP(16*4,$out),$inout4);
1236 &pshufd ($twres,$twtmp,0x13);
1237 &movups (&QWP(16*5,$out),$inout5);
1238 &lea ($out,&DWP(16*6,$out));
# $twmask shares a register with $inout1, so the constant is reloaded.
1239 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1241 &pxor ($twtmp,$twtmp);
1242 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1243 &pand ($twres,$twmask); # isolate carry and residue
1244 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1245 &pxor ($tweak,$twres);
1248 &jnc (&label("xts_enc_loop6"));
1250 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
1251 &mov ($key,$key_); # restore $key
1252 &mov ($rounds_,$rounds);
# Fewer than six blocks left.  Tweaks are generated one at a time and the
# code branches out as soon as enough of them exist: the first is kept in
# $inout3, the second in $inout4 and the third in $inout5.
1254 &set_label("xts_enc_short");
1256 &jz (&label("xts_enc_done6x"));
1258 &movdqa ($inout3,$tweak); # put aside previous tweak
1260 &jb (&label("xts_enc_one"));
1262 &pshufd ($twres,$twtmp,0x13);
1263 &pxor ($twtmp,$twtmp);
1264 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1265 &pand ($twres,$twmask); # isolate carry and residue
1266 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1267 &pxor ($tweak,$twres);
1268 &je (&label("xts_enc_two"));
1270 &pshufd ($twres,$twtmp,0x13);
1271 &pxor ($twtmp,$twtmp);
1272 &movdqa ($inout4,$tweak); # put aside previous tweak
1273 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1274 &pand ($twres,$twmask); # isolate carry and residue
1275 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1276 &pxor ($tweak,$twres);
1278 &jb (&label("xts_enc_three"));
1280 &pshufd ($twres,$twtmp,0x13);
1281 &pxor ($twtmp,$twtmp);
1282 &movdqa ($inout5,$tweak); # put aside previous tweak
1283 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1284 &pand ($twres,$twmask); # isolate carry and residue
1285 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1286 &pxor ($tweak,$twres);
# The first two tweaks move to the frame because $inout3/$inout4 are
# needed as data registers in the four- and five-block paths.
1287 &movdqa (&QWP(16*0,"esp"),$inout3);
1288 &movdqa (&QWP(16*1,"esp"),$inout4);
1289 &je (&label("xts_enc_four"));
# Five-block fall-through: tweaks three and four go to slots 16*2/16*3,
# the fifth is built in $inout5 and saved to slot 16*4.
1291 &movdqa (&QWP(16*2,"esp"),$inout5);
1292 &pshufd ($inout5,$twtmp,0x13);
1293 &movdqa (&QWP(16*3,"esp"),$tweak);
1294 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1295 &pand ($inout5,$twmask); # isolate carry and residue
1296 &pxor ($inout5,$tweak);
1298 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1299 &movdqu ($inout1,&QWP(16*1,$inp));
1300 &movdqu ($inout2,&QWP(16*2,$inp));
1301 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1302 &movdqu ($inout3,&QWP(16*3,$inp));
1303 &pxor ($inout1,&QWP(16*1,"esp"));
1304 &movdqu ($inout4,&QWP(16*4,$inp));
1305 &pxor ($inout2,&QWP(16*2,"esp"));
1306 &lea ($inp,&DWP(16*5,$inp));
1307 &pxor ($inout3,&QWP(16*3,"esp"));
1308 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1309 &pxor ($inout4,$inout5);
# The six-lane routine is used for five blocks; only $inout0..$inout4 are
# stored afterwards.
1311 &call ("_aesni_encrypt6");
1313 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1314 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1315 &xorps ($inout1,&QWP(16*1,"esp"));
1316 &xorps ($inout2,&QWP(16*2,"esp"));
1317 &movups (&QWP(16*0,$out),$inout0); # write output
1318 &xorps ($inout3,&QWP(16*3,"esp"));
1319 &movups (&QWP(16*1,$out),$inout1);
1320 &xorps ($inout4,$tweak);
1321 &movups (&QWP(16*2,$out),$inout2);
1322 &movups (&QWP(16*3,$out),$inout3);
1323 &movups (&QWP(16*4,$out),$inout4);
1324 &lea ($out,&DWP(16*5,$out));
1325 &jmp (&label("xts_enc_done"));
# One block: the tweak is in $inout3.
1327 &set_label("xts_enc_one",16);
1328 &movups ($inout0,&QWP(16*0,$inp)); # load input
1329 &lea ($inp,&DWP(16*1,$inp));
1330 &xorps ($inout0,$inout3); # input^=tweak
# NOTE(review): inline and call variants appear back to back, see the
# note at the top of the function.
1332 { &aesni_inline_generate1("enc"); }
1334 { &call ("_aesni_encrypt1"); }
1335 &xorps ($inout0,$inout3); # output^=tweak
1336 &movups (&QWP(16*0,$out),$inout0); # write output
1337 &lea ($out,&DWP(16*1,$out));
1339 &movdqa ($tweak,$inout3); # last tweak
1340 &jmp (&label("xts_enc_done"));
# Two blocks: tweaks are in $inout3 and $tweak (copied to $inout4).
1342 &set_label("xts_enc_two",16);
1343 &movaps ($inout4,$tweak); # put aside last tweak
1345 &movups ($inout0,&QWP(16*0,$inp)); # load input
1346 &movups ($inout1,&QWP(16*1,$inp));
1347 &lea ($inp,&DWP(16*2,$inp));
1348 &xorps ($inout0,$inout3); # input^=tweak
1349 &xorps ($inout1,$inout4);
1351 &call ("_aesni_encrypt2");
1353 &xorps ($inout0,$inout3); # output^=tweak
1354 &xorps ($inout1,$inout4);
1355 &movups (&QWP(16*0,$out),$inout0); # write output
1356 &movups (&QWP(16*1,$out),$inout1);
1357 &lea ($out,&DWP(16*2,$out));
1359 &movdqa ($tweak,$inout4); # last tweak
1360 &jmp (&label("xts_enc_done"));
# Three blocks: tweaks are in $inout3, $inout4 and $tweak (copied to
# $inout5).
1362 &set_label("xts_enc_three",16);
1363 &movaps ($inout5,$tweak); # put aside last tweak
1364 &movups ($inout0,&QWP(16*0,$inp)); # load input
1365 &movups ($inout1,&QWP(16*1,$inp));
1366 &movups ($inout2,&QWP(16*2,$inp));
1367 &lea ($inp,&DWP(16*3,$inp));
1368 &xorps ($inout0,$inout3); # input^=tweak
1369 &xorps ($inout1,$inout4);
1370 &xorps ($inout2,$inout5);
1372 &call ("_aesni_encrypt3");
1374 &xorps ($inout0,$inout3); # output^=tweak
1375 &xorps ($inout1,$inout4);
1376 &xorps ($inout2,$inout5);
1377 &movups (&QWP(16*0,$out),$inout0); # write output
1378 &movups (&QWP(16*1,$out),$inout1);
1379 &movups (&QWP(16*2,$out),$inout2);
1380 &lea ($out,&DWP(16*3,$out));
1382 &movdqa ($tweak,$inout5); # last tweak
1383 &jmp (&label("xts_enc_done"));
# Four blocks: tweaks one and two are in frame slots 16*0/16*1, the third
# is in $inout5 and the fourth in $tweak (copied to $inout4).
1385 &set_label("xts_enc_four",16);
1386 &movaps ($inout4,$tweak); # put aside last tweak
1388 &movups ($inout0,&QWP(16*0,$inp)); # load input
1389 &movups ($inout1,&QWP(16*1,$inp));
1390 &movups ($inout2,&QWP(16*2,$inp));
1391 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1392 &movups ($inout3,&QWP(16*3,$inp));
1393 &lea ($inp,&DWP(16*4,$inp));
1394 &xorps ($inout1,&QWP(16*1,"esp"));
1395 &xorps ($inout2,$inout5);
1396 &xorps ($inout3,$inout4);
1398 &call ("_aesni_encrypt4");
1400 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1401 &xorps ($inout1,&QWP(16*1,"esp"));
1402 &xorps ($inout2,$inout5);
1403 &movups (&QWP(16*0,$out),$inout0); # write output
1404 &xorps ($inout3,$inout4);
1405 &movups (&QWP(16*1,$out),$inout1);
1406 &movups (&QWP(16*2,$out),$inout2);
1407 &movups (&QWP(16*3,$out),$inout3);
1408 &lea ($out,&DWP(16*4,$out));
1410 &movdqa ($tweak,$inout4); # last tweak
1411 &jmp (&label("xts_enc_done"));
# Reached from xts_enc_short when no whole block is left; $tweak already
# holds the next tweak, so it is only copied to $inout3 before stealing.
1413 &set_label("xts_enc_done6x",16); # $tweak is pre-calculated
1414 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1416 &jz (&label("xts_enc_ret"));
1417 &movdqa ($inout3,$tweak);
1418 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1419 &jmp (&label("xts_enc_steal"));
# Reached from the one..five block paths with the last used tweak in
# $tweak; the next tweak is computed into $inout3 for the stealing step.
1421 &set_label("xts_enc_done",16);
1422 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1423 &pxor ($twtmp,$twtmp);
1425 &jz (&label("xts_enc_ret"));
1427 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1428 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1429 &pshufd ($inout3,$twtmp,0x13);
1430 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1431 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
1432 &pxor ($inout3,$tweak);
# Byte-swap loop: each pass copies one input byte into the previous output
# block (at $out-16) and moves the byte it displaces to $out.  $rounds and
# $key serve as byte scratch registers here and are restored afterwards.
1434 &set_label("xts_enc_steal");
1435 &movz ($rounds,&BP(0,$inp));
1436 &movz ($key,&BP(-16,$out));
1437 &lea ($inp,&DWP(1,$inp));
1438 &mov (&BP(-16,$out),&LB($rounds));
1439 &mov (&BP(0,$out),&LB($key));
1440 &lea ($out,&DWP(1,$out));
1442 &jnz (&label("xts_enc_steal"));
# Re-encrypt the patched block in place using the tweak held in $inout3.
1444 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1445 &mov ($key,$key_); # restore $key
1446 &mov ($rounds,$rounds_); # restore $rounds
1448 &movups ($inout0,&QWP(-16,$out)); # load input
1449 &xorps ($inout0,$inout3); # input^=tweak
# NOTE(review): inline and call variants appear back to back, see the
# note at the top of the function.
1451 { &aesni_inline_generate1("enc"); }
1453 { &call ("_aesni_encrypt1"); }
1454 &xorps ($inout0,$inout3); # output^=tweak
1455 &movups (&QWP(-16,$out),$inout0); # write output
# Common exit: wipe xmm0-xmm7 and the six tweak slots, then restore %esp
# from the value saved at 16*7+4.
1457 &set_label("xts_enc_ret");
1458 &pxor ("xmm0","xmm0"); # clear register bank
1459 &pxor ("xmm1","xmm1");
1460 &pxor ("xmm2","xmm2");
1461 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
1462 &pxor ("xmm3","xmm3");
1463 &movdqa (&QWP(16*1,"esp"),"xmm0");
1464 &pxor ("xmm4","xmm4");
1465 &movdqa (&QWP(16*2,"esp"),"xmm0");
1466 &pxor ("xmm5","xmm5");
1467 &movdqa (&QWP(16*3,"esp"),"xmm0");
1468 &pxor ("xmm6","xmm6");
1469 &movdqa (&QWP(16*4,"esp"),"xmm0");
1470 &pxor ("xmm7","xmm7");
1471 &movdqa (&QWP(16*5,"esp"),"xmm0");
1472 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1473 &function_end("aesni_xts_encrypt");
# aesni_xts_decrypt(inp, out, len, key1, key2, iv)
#
# Decryption counterpart of aesni_xts_encrypt; it relies on the $tweak,
# $twtmp, $twres and $twmask register aliases declared before that routine.
#
# Outline, as far as this listing shows:
#   1. The clear-text tweak (argument 5) is *encrypted* with key2
#      (argument 4); only the data blocks are decrypted.
#   2. Load inp/out/len/key1 and set up an aligned stack frame:
#        16*0 .. 16*5   per-block tweaks
#        16*6           the 0x87/0/1/0 constant used when updating the tweak
#        16*7+0         saved $len
#        16*7+4         saved $key_ (commented below as the original %esp)
#   3. xts_dec_loop6 processes six blocks per pass; xts_dec_short and the
#      xts_dec_one/two/three/four labels (plus a five-block fall-through)
#      process what is left.
#   4. When a partial block remains, xts_dec_only_one_more decrypts one
#      further block with the later of two tweaks, xts_dec_steal swaps
#      bytes, and the patched block is decrypted with the earlier tweak.
#   5. xts_dec_ret zeroes xmm0-xmm7 and frame slots 16*0..16*5 and
#      restores %esp from the frame.
#
# NOTE(review): several conditional jumps below (jc/jz/jb/je/jnc/jnz) have
# no flag-setting instruction in front of them in this listing; presumably
# the corresponding and/sub/cmp/test lines are simply not visible here --
# confirm against the complete file before relying on the branch comments.
1475 &function_begin("aesni_xts_decrypt");
# Step 1: encrypt the clear-text tweak with key2.
1476 &mov ($key,&wparam(4)); # key2
1477 &mov ($inp,&wparam(5)); # clear-text tweak
1479 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1480 &movups ($inout0,&QWP(0,$inp));
# NOTE(review): an inlined single-block encryption and a call to
# _aesni_encrypt1 appear back to back; presumably a conditional that picks
# one of the two is not visible in this listing -- confirm.
1482 { &aesni_inline_generate1("enc"); }
1484 { &call ("_aesni_encrypt1"); }
# Step 2: load the data arguments and key1, then build the stack frame.
1486 &mov ($inp,&wparam(0));
1487 &mov ($out,&wparam(1));
1488 &mov ($len,&wparam(2));
1489 &mov ($key,&wparam(3)); # key1
1492 &sub ("esp",16*7+8);
1493 &and ("esp",-16); # align stack
# $rounds_ is used as scratch to reduce $len when a partial block exists.
# NOTE(review): the test that feeds setnz and any scaling of $rounds_
# before the subtraction are not visible in this listing -- confirm.
1495 &xor ($rounds_,$rounds_); # if(len%16) len-=16;
1497 &setnz (&LB($rounds_));
1499 &sub ($len,$rounds_);
# The four dwords 0x87, 0, 1, 0 are written to frame slot 16*6 and later
# loaded into $twmask.
1501 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1502 &mov (&DWP(16*6+4,"esp"),0);
1503 &mov (&DWP(16*6+8,"esp"),1);
1504 &mov (&DWP(16*6+12,"esp"),0);
1505 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
# NOTE(review): $key_ presumably received a copy of %esp on a line that is
# not visible here -- confirm.
1506 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1508 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1509 &mov ($key_,$key); # backup $key
1510 &mov ($rounds_,$rounds); # backup $rounds
# The encrypted tweak (result left in $inout0) becomes the running $tweak.
# Tweak update idiom used throughout this function: pcmpgtd against a
# zeroed $twtmp yields a per-dword mask of the sign bits of $tweak, pshufd
# 0x13 rearranges those masks, pand with $twmask keeps only the 0x87 and 1
# bit positions, paddq doubles both 64-bit halves and pxor folds the
# masked bits back in.
1512 &movdqa ($tweak,$inout0);
1513 &pxor ($twtmp,$twtmp);
1514 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1515 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1519 &jc (&label("xts_dec_short"));
1523 &sub ($rounds_,$rounds);
1524 &lea ($key,&DWP(32,$key,$rounds));
1525 &jmp (&label("xts_dec_loop6"));
# Six-block main loop.  The Perl for-loop emits the tweak update four
# times, storing tweaks into frame slots 16*0..16*3.
# NOTE(review): the loop's closing brace is not visible in this listing;
# presumably it follows the first pxor ($tweak,$twres) -- confirm.
1527 &set_label("xts_dec_loop6",16);
1528 for ($i=0;$i<4;$i++) {
1529 &pshufd ($twres,$twtmp,0x13);
1530 &pxor ($twtmp,$twtmp);
1531 &movdqa (&QWP(16*$i,"esp"),$tweak);
1532 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1533 &pand ($twres,$twmask); # isolate carry and residue
1534 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1535 &pxor ($tweak,$twres);
# Fifth tweak goes to slot 16*4 ($i is post-incremented to 5 here); the
# sixth tweak is built in $inout5 and saved to slot 16*5 further down.
1537 &pshufd ($inout5,$twtmp,0x13);
1538 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1539 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1540 &$movekey ($rndkey0,&QWP(0,$key_));
1541 &pand ($inout5,$twmask); # isolate carry and residue
1542 &movups ($inout0,&QWP(0,$inp)); # load input
1543 &pxor ($inout5,$tweak);
1545 # inline _aesni_decrypt6 prologue and flip xor with tweak and key[0]
1546 &mov ($rounds,$rounds_);
1547 &movdqu ($inout1,&QWP(16*1,$inp));
1548 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1549 &movdqu ($inout2,&QWP(16*2,$inp));
1550 &pxor ($inout1,$rndkey0);
1551 &movdqu ($inout3,&QWP(16*3,$inp));
1552 &pxor ($inout2,$rndkey0);
1553 &movdqu ($inout4,&QWP(16*4,$inp));
1554 &pxor ($inout3,$rndkey0);
# The sixth input block is loaded into $rndkey1 (free at this point) and
# xored into $inout5, which already holds the sixth tweak.
1555 &movdqu ($rndkey1,&QWP(16*5,$inp));
1556 &pxor ($inout4,$rndkey0);
1557 &lea ($inp,&DWP(16*6,$inp));
1558 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1559 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1560 &pxor ($inout5,$rndkey1);
# First AES decryption round on all six lanes, interleaved with the
# remaining tweak xors, before entering the shared six-block routine at
# its _enter label.
1562 &$movekey ($rndkey1,&QWP(16,$key_));
1563 &pxor ($inout1,&QWP(16*1,"esp"));
1564 &pxor ($inout2,&QWP(16*2,"esp"));
1565 &aesdec ($inout0,$rndkey1);
1566 &pxor ($inout3,&QWP(16*3,"esp"));
1567 &pxor ($inout4,&QWP(16*4,"esp"));
1568 &aesdec ($inout1,$rndkey1);
1569 &pxor ($inout5,$rndkey0);
1570 &$movekey ($rndkey0,&QWP(32,$key_));
1571 &aesdec ($inout2,$rndkey1);
1572 &aesdec ($inout3,$rndkey1);
1573 &aesdec ($inout4,$rndkey1);
1574 &aesdec ($inout5,$rndkey1);
1575 &call (&label("_aesni_decrypt6_enter"));
# Post-process: xor each result with its saved tweak, store six blocks,
# and derive the next tweak from the sixth one reloaded from slot 16*5.
1577 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1578 &pxor ($twtmp,$twtmp);
1579 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1580 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1581 &xorps ($inout1,&QWP(16*1,"esp"));
1582 &movups (&QWP(16*0,$out),$inout0); # write output
1583 &xorps ($inout2,&QWP(16*2,"esp"));
1584 &movups (&QWP(16*1,$out),$inout1);
1585 &xorps ($inout3,&QWP(16*3,"esp"));
1586 &movups (&QWP(16*2,$out),$inout2);
1587 &xorps ($inout4,&QWP(16*4,"esp"));
1588 &movups (&QWP(16*3,$out),$inout3);
1589 &xorps ($inout5,$tweak);
1590 &movups (&QWP(16*4,$out),$inout4);
1591 &pshufd ($twres,$twtmp,0x13);
1592 &movups (&QWP(16*5,$out),$inout5);
1593 &lea ($out,&DWP(16*6,$out));
# $twmask shares a register with $inout1, so the constant is reloaded.
1594 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1596 &pxor ($twtmp,$twtmp);
1597 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1598 &pand ($twres,$twmask); # isolate carry and residue
1599 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1600 &pxor ($tweak,$twres);
1603 &jnc (&label("xts_dec_loop6"));
1605 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
1606 &mov ($key,$key_); # restore $key
1607 &mov ($rounds_,$rounds);
# Fewer than six blocks left.  Tweaks are generated one at a time and the
# code branches out as soon as enough of them exist: the first is kept in
# $inout3, the second in $inout4 and the third in $inout5.
1609 &set_label("xts_dec_short");
1611 &jz (&label("xts_dec_done6x"));
1613 &movdqa ($inout3,$tweak); # put aside previous tweak
1615 &jb (&label("xts_dec_one"));
1617 &pshufd ($twres,$twtmp,0x13);
1618 &pxor ($twtmp,$twtmp);
1619 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1620 &pand ($twres,$twmask); # isolate carry and residue
1621 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1622 &pxor ($tweak,$twres);
1623 &je (&label("xts_dec_two"));
1625 &pshufd ($twres,$twtmp,0x13);
1626 &pxor ($twtmp,$twtmp);
1627 &movdqa ($inout4,$tweak); # put aside previous tweak
1628 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1629 &pand ($twres,$twmask); # isolate carry and residue
1630 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1631 &pxor ($tweak,$twres);
1633 &jb (&label("xts_dec_three"));
1635 &pshufd ($twres,$twtmp,0x13);
1636 &pxor ($twtmp,$twtmp);
1637 &movdqa ($inout5,$tweak); # put aside previous tweak
1638 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1639 &pand ($twres,$twmask); # isolate carry and residue
1640 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1641 &pxor ($tweak,$twres);
# The first two tweaks move to the frame because $inout3/$inout4 are
# needed as data registers in the four- and five-block paths.
1642 &movdqa (&QWP(16*0,"esp"),$inout3);
1643 &movdqa (&QWP(16*1,"esp"),$inout4);
1644 &je (&label("xts_dec_four"));
# Five-block fall-through: tweaks three and four go to slots 16*2/16*3,
# the fifth is built in $inout5 and saved to slot 16*4.
1646 &movdqa (&QWP(16*2,"esp"),$inout5);
1647 &pshufd ($inout5,$twtmp,0x13);
1648 &movdqa (&QWP(16*3,"esp"),$tweak);
1649 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1650 &pand ($inout5,$twmask); # isolate carry and residue
1651 &pxor ($inout5,$tweak);
1653 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1654 &movdqu ($inout1,&QWP(16*1,$inp));
1655 &movdqu ($inout2,&QWP(16*2,$inp));
1656 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1657 &movdqu ($inout3,&QWP(16*3,$inp));
1658 &pxor ($inout1,&QWP(16*1,"esp"));
1659 &movdqu ($inout4,&QWP(16*4,$inp));
1660 &pxor ($inout2,&QWP(16*2,"esp"));
1661 &lea ($inp,&DWP(16*5,$inp));
1662 &pxor ($inout3,&QWP(16*3,"esp"));
1663 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1664 &pxor ($inout4,$inout5);
# The six-lane routine is used for five blocks; only $inout0..$inout4 are
# stored afterwards.
1666 &call ("_aesni_decrypt6");
1668 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1669 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1670 &xorps ($inout1,&QWP(16*1,"esp"));
1671 &xorps ($inout2,&QWP(16*2,"esp"));
1672 &movups (&QWP(16*0,$out),$inout0); # write output
1673 &xorps ($inout3,&QWP(16*3,"esp"));
1674 &movups (&QWP(16*1,$out),$inout1);
1675 &xorps ($inout4,$tweak);
1676 &movups (&QWP(16*2,$out),$inout2);
1677 &movups (&QWP(16*3,$out),$inout3);
1678 &movups (&QWP(16*4,$out),$inout4);
1679 &lea ($out,&DWP(16*5,$out));
1680 &jmp (&label("xts_dec_done"));
# One block: the tweak is in $inout3.
1682 &set_label("xts_dec_one",16);
1683 &movups ($inout0,&QWP(16*0,$inp)); # load input
1684 &lea ($inp,&DWP(16*1,$inp));
1685 &xorps ($inout0,$inout3); # input^=tweak
# NOTE(review): inline and call variants appear back to back, see the
# note at the top of the function.
1687 { &aesni_inline_generate1("dec"); }
1689 { &call ("_aesni_decrypt1"); }
1690 &xorps ($inout0,$inout3); # output^=tweak
1691 &movups (&QWP(16*0,$out),$inout0); # write output
1692 &lea ($out,&DWP(16*1,$out));
1694 &movdqa ($tweak,$inout3); # last tweak
1695 &jmp (&label("xts_dec_done"));
# Two blocks: tweaks are in $inout3 and $tweak (copied to $inout4).
1697 &set_label("xts_dec_two",16);
1698 &movaps ($inout4,$tweak); # put aside last tweak
1700 &movups ($inout0,&QWP(16*0,$inp)); # load input
1701 &movups ($inout1,&QWP(16*1,$inp));
1702 &lea ($inp,&DWP(16*2,$inp));
1703 &xorps ($inout0,$inout3); # input^=tweak
1704 &xorps ($inout1,$inout4);
1706 &call ("_aesni_decrypt2");
1708 &xorps ($inout0,$inout3); # output^=tweak
1709 &xorps ($inout1,$inout4);
1710 &movups (&QWP(16*0,$out),$inout0); # write output
1711 &movups (&QWP(16*1,$out),$inout1);
1712 &lea ($out,&DWP(16*2,$out));
1714 &movdqa ($tweak,$inout4); # last tweak
1715 &jmp (&label("xts_dec_done"));
# Three blocks: tweaks are in $inout3, $inout4 and $tweak (copied to
# $inout5).
1717 &set_label("xts_dec_three",16);
1718 &movaps ($inout5,$tweak); # put aside last tweak
1719 &movups ($inout0,&QWP(16*0,$inp)); # load input
1720 &movups ($inout1,&QWP(16*1,$inp));
1721 &movups ($inout2,&QWP(16*2,$inp));
1722 &lea ($inp,&DWP(16*3,$inp));
1723 &xorps ($inout0,$inout3); # input^=tweak
1724 &xorps ($inout1,$inout4);
1725 &xorps ($inout2,$inout5);
1727 &call ("_aesni_decrypt3");
1729 &xorps ($inout0,$inout3); # output^=tweak
1730 &xorps ($inout1,$inout4);
1731 &xorps ($inout2,$inout5);
1732 &movups (&QWP(16*0,$out),$inout0); # write output
1733 &movups (&QWP(16*1,$out),$inout1);
1734 &movups (&QWP(16*2,$out),$inout2);
1735 &lea ($out,&DWP(16*3,$out));
1737 &movdqa ($tweak,$inout5); # last tweak
1738 &jmp (&label("xts_dec_done"));
# Four blocks: tweaks one and two are in frame slots 16*0/16*1, the third
# is in $inout5 and the fourth in $tweak (copied to $inout4).
1740 &set_label("xts_dec_four",16);
1741 &movaps ($inout4,$tweak); # put aside last tweak
1743 &movups ($inout0,&QWP(16*0,$inp)); # load input
1744 &movups ($inout1,&QWP(16*1,$inp));
1745 &movups ($inout2,&QWP(16*2,$inp));
1746 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1747 &movups ($inout3,&QWP(16*3,$inp));
1748 &lea ($inp,&DWP(16*4,$inp));
1749 &xorps ($inout1,&QWP(16*1,"esp"));
1750 &xorps ($inout2,$inout5);
1751 &xorps ($inout3,$inout4);
1753 &call ("_aesni_decrypt4");
1755 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1756 &xorps ($inout1,&QWP(16*1,"esp"));
1757 &xorps ($inout2,$inout5);
1758 &movups (&QWP(16*0,$out),$inout0); # write output
1759 &xorps ($inout3,$inout4);
1760 &movups (&QWP(16*1,$out),$inout1);
1761 &movups (&QWP(16*2,$out),$inout2);
1762 &movups (&QWP(16*3,$out),$inout3);
1763 &lea ($out,&DWP(16*4,$out));
1765 &movdqa ($tweak,$inout4); # last tweak
1766 &jmp (&label("xts_dec_done"));
# Reached from xts_dec_short when no whole block is left; $tweak already
# holds the next tweak, so control goes straight to the point where only
# one more tweak has to be derived.
1768 &set_label("xts_dec_done6x",16); # $tweak is pre-calculated
1769 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1771 &jz (&label("xts_dec_ret"));
1772 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1773 &jmp (&label("xts_dec_only_one_more"));
# Reached from the one..five block paths with the last used tweak in
# $tweak; when a partial block remains the next tweak is computed here.
1775 &set_label("xts_dec_done",16);
1776 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1777 &pxor ($twtmp,$twtmp);
1779 &jz (&label("xts_dec_ret"));
1781 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1782 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1783 &pshufd ($twres,$twtmp,0x13);
1784 &pxor ($twtmp,$twtmp);
1785 &movdqa ($twmask,&QWP(16*6,"esp"));
1786 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1787 &pand ($twres,$twmask); # isolate carry and residue
1788 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1789 &pxor ($tweak,$twres);
# The current tweak is kept in $inout4 for the final (stolen) block and a
# further tweak is derived into $inout3; the block at $inp is decrypted
# with that later tweak first.
1791 &set_label("xts_dec_only_one_more");
1792 &pshufd ($inout3,$twtmp,0x13);
1793 &movdqa ($inout4,$tweak); # put aside previous tweak
1794 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1795 &pand ($inout3,$twmask); # isolate carry and residue
1796 &pxor ($inout3,$tweak);
1798 &mov ($key,$key_); # restore $key
1799 &mov ($rounds,$rounds_); # restore $rounds
1801 &movups ($inout0,&QWP(0,$inp)); # load input
1802 &xorps ($inout0,$inout3); # input^=tweak
# NOTE(review): inline and call variants appear back to back, see the
# note at the top of the function.
1804 { &aesni_inline_generate1("dec"); }
1806 { &call ("_aesni_decrypt1"); }
1807 &xorps ($inout0,$inout3); # output^=tweak
1808 &movups (&QWP(0,$out),$inout0); # write output
# Byte-swap loop: each pass copies one input byte from $inp+16 into the
# block just written at $out and moves the byte it displaces to $out+16.
# $rounds and $key serve as byte scratch registers here and are restored
# afterwards.
1810 &set_label("xts_dec_steal");
1811 &movz ($rounds,&BP(16,$inp));
1812 &movz ($key,&BP(0,$out));
1813 &lea ($inp,&DWP(1,$inp));
1814 &mov (&BP(0,$out),&LB($rounds));
1815 &mov (&BP(16,$out),&LB($key));
1816 &lea ($out,&DWP(1,$out));
1818 &jnz (&label("xts_dec_steal"));
# Decrypt the patched block in place using the tweak put aside in $inout4.
1820 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1821 &mov ($key,$key_); # restore $key
1822 &mov ($rounds,$rounds_); # restore $rounds
1824 &movups ($inout0,&QWP(0,$out)); # load input
1825 &xorps ($inout0,$inout4); # input^=tweak
# NOTE(review): inline and call variants appear back to back, see the
# note at the top of the function.
1827 { &aesni_inline_generate1("dec"); }
1829 { &call ("_aesni_decrypt1"); }
1830 &xorps ($inout0,$inout4); # output^=tweak
1831 &movups (&QWP(0,$out),$inout0); # write output
# Common exit: wipe xmm0-xmm7 and the six tweak slots, then restore %esp
# from the value saved at 16*7+4.
1833 &set_label("xts_dec_ret");
1834 &pxor ("xmm0","xmm0"); # clear register bank
1835 &pxor ("xmm1","xmm1");
1836 &pxor ("xmm2","xmm2");
1837 &movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
1838 &pxor ("xmm3","xmm3");
1839 &movdqa (&QWP(16*1,"esp"),"xmm0");
1840 &pxor ("xmm4","xmm4");
1841 &movdqa (&QWP(16*2,"esp"),"xmm0");
1842 &pxor ("xmm5","xmm5");
1843 &movdqa (&QWP(16*3,"esp"),"xmm0");
1844 &pxor ("xmm6","xmm6");
1845 &movdqa (&QWP(16*4,"esp"),"xmm0");
1846 &pxor ("xmm7","xmm7");
1847 &movdqa (&QWP(16*5,"esp"),"xmm0");
1848 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1849 &function_end("aesni_xts_decrypt");
1852 ######################################################################
1853 # void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
1854 # const AES_KEY *key, unsigned int start_block_num,
1855 # unsigned char offset_i[16], const unsigned char L_[][16],
1856 # unsigned char checksum[16]);
# Stack-frame layout for the OCB routines: $checksum is byte offset 16*6,
# and $key_off, $rounds_off, $out_off, $end_off and $esp_off are five
# consecutive 4-byte slots starting at byte offset 16*7.
1859 # offsets within stack frame
1860 my $checksum = 16*6;
1861 my ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4));
# $l_, $block, $i1, $i3 and $i5 are additional names for $rounds_, $key_,
# $rounds, $len and $out respectively.
1863 # reassigned registers
1864 my ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out);
1865 # $l_, $block, $inp, $key are permanently allocated in registers;
1866 # the remaining non-volatile values are offloaded to the stack and
1867 # stay invariant once they have been written there.
1869 &function_begin("aesni_ocb_encrypt");
1870 &mov ($rounds,&wparam(5)); # &offset_i
1871 &mov ($rounds_,&wparam(7)); # &checksum
1873 &mov ($inp,&wparam(0));
1874 &mov ($out,&wparam(1));
1875 &mov ($len,&wparam(2));
1876 &mov ($key,&wparam(3));
1877 &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i
1878 &mov ($block,&wparam(4)); # start_block_num
1879 &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum
1880 &mov ($l_,&wparam(6)); # L_
1882 &mov ($rounds,"esp");
1883 &sub ("esp",$esp_off+4); # alloca
1884 &and ("esp",-16); # align stack
1888 &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6
1889 &mov (&DWP($out_off,"esp"),$out);
1890 &mov (&DWP($end_off,"esp"),$len);
1891 &mov (&DWP($esp_off,"esp"),$rounds);
1893 &mov ($rounds,&DWP(240,$key));
1896 &jnz (&label("odd"));
1901 &movdqu ($inout5,&QWP(0,$l_,$i3));
1902 &mov ($i3,$key); # put aside key
1904 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1905 &lea ($inp,&DWP(16,$inp));
1907 &pxor ($inout5,$rndkey0); # ^ last offset_i
1908 &pxor ($rndkey1,$inout0); # checksum
1909 &pxor ($inout0,$inout5); # ^ offset_i
1911 &movdqa ($inout4,$rndkey1);
1913 { &aesni_inline_generate1("enc"); }
1915 { &call ("_aesni_encrypt1"); }
1917 &xorps ($inout0,$inout5); # ^ offset_i
1918 &movdqa ($rndkey0,$inout5); # pass last offset_i
1919 &movdqa ($rndkey1,$inout4); # pass the checksum
1921 &movups (&QWP(-16,$out,$inp),$inout0); # store output
1923 &mov ($rounds,&DWP(240,$i3));
1924 &mov ($key,$i3); # restore key
1925 &mov ($len,&DWP($end_off,"esp"));
1930 &sub ($out,$rounds); # twisted rounds
1931 &mov (&DWP($key_off,"esp"),$key);
1932 &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule
1933 &mov (&DWP($rounds_off,"esp"),$out);
1936 &ja (&label("short"));
1937 &jmp (&label("grandloop"));
1939 &set_label("grandloop",32);
1940 &lea ($i1,&DWP(1,$block));
1941 &lea ($i3,&DWP(3,$block));
1942 &lea ($i5,&DWP(5,$block));
1950 &movdqu ($inout0,&QWP(0,$l_));
1951 &movdqu ($inout1,&QWP(0,$l_,$i1));
1952 &mov ($rounds,&DWP($rounds_off,"esp"));
1953 &movdqa ($inout2,$inout0);
1954 &movdqu ($inout3,&QWP(0,$l_,$i3));
1955 &movdqa ($inout4,$inout0);
1956 &movdqu ($inout5,&QWP(0,$l_,$i5));
1958 &pxor ($inout0,$rndkey0); # ^ last offset_i
1959 &pxor ($inout1,$inout0);
1960 &movdqa (&QWP(16*0,"esp"),$inout0);
1961 &pxor ($inout2,$inout1);
1962 &movdqa (&QWP(16*1,"esp"),$inout1);
1963 &pxor ($inout3,$inout2);
1964 &movdqa (&QWP(16*2,"esp"),$inout2);
1965 &pxor ($inout4,$inout3);
1966 &movdqa (&QWP(16*3,"esp"),$inout3);
1967 &pxor ($inout5,$inout4);
1968 &movdqa (&QWP(16*4,"esp"),$inout4);
1969 &movdqa (&QWP(16*5,"esp"),$inout5);
1971 &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
1972 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1973 &movdqu ($inout1,&QWP(16*1,$inp));
1974 &movdqu ($inout2,&QWP(16*2,$inp));
1975 &movdqu ($inout3,&QWP(16*3,$inp));
1976 &movdqu ($inout4,&QWP(16*4,$inp));
1977 &movdqu ($inout5,&QWP(16*5,$inp));
1978 &lea ($inp,&DWP(16*6,$inp));
1980 &pxor ($rndkey1,$inout0); # checksum
1981 &pxor ($inout0,$rndkey0); # ^ roundkey[0]
1982 &pxor ($rndkey1,$inout1);
1983 &pxor ($inout1,$rndkey0);
1984 &pxor ($rndkey1,$inout2);
1985 &pxor ($inout2,$rndkey0);
1986 &pxor ($rndkey1,$inout3);
1987 &pxor ($inout3,$rndkey0);
1988 &pxor ($rndkey1,$inout4);
1989 &pxor ($inout4,$rndkey0);
1990 &pxor ($rndkey1,$inout5);
1991 &pxor ($inout5,$rndkey0);
1992 &movdqa (&QWP($checksum,"esp"),$rndkey1);
1994 &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
1995 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
1996 &pxor ($inout1,&QWP(16*1,"esp"));
1997 &pxor ($inout2,&QWP(16*2,"esp"));
1998 &pxor ($inout3,&QWP(16*3,"esp"));
1999 &pxor ($inout4,&QWP(16*4,"esp"));
2000 &pxor ($inout5,&QWP(16*5,"esp"));
2002 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
2003 &aesenc ($inout0,$rndkey1);
2004 &aesenc ($inout1,$rndkey1);
2005 &aesenc ($inout2,$rndkey1);
2006 &aesenc ($inout3,$rndkey1);
2007 &aesenc ($inout4,$rndkey1);
2008 &aesenc ($inout5,$rndkey1);
2010 &mov ($out,&DWP($out_off,"esp"));
2011 &mov ($len,&DWP($end_off,"esp"));
2012 &call ("_aesni_encrypt6_enter");
2014 &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i
2015 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2016 &pxor ($inout1,&QWP(16*1,"esp"));
2017 &pxor ($inout2,&QWP(16*2,"esp"));
2018 &pxor ($inout3,&QWP(16*3,"esp"));
2019 &pxor ($inout4,&QWP(16*4,"esp"));
2020 &pxor ($inout5,$rndkey0);
2021 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2023 &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output
2024 &movdqu (&QWP(-16*5,$out,$inp),$inout1);
2025 &movdqu (&QWP(-16*4,$out,$inp),$inout2);
2026 &movdqu (&QWP(-16*3,$out,$inp),$inout3);
2027 &movdqu (&QWP(-16*2,$out,$inp),$inout4);
2028 &movdqu (&QWP(-16*1,$out,$inp),$inout5);
2029 &cmp ($inp,$len); # done yet?
2030 &jb (&label("grandloop"));
2032 &set_label("short");
2035 &jz (&label("done"));
2038 &jb (&label("one"));
2039 &je (&label("two"));
2042 &jb (&label("three"));
2043 &je (&label("four"));
2045 &lea ($i1,&DWP(1,$block));
2046 &lea ($i3,&DWP(3,$block));
2051 &movdqu ($inout0,&QWP(0,$l_));
2052 &movdqu ($inout1,&QWP(0,$l_,$i1));
2053 &mov ($rounds,&DWP($rounds_off,"esp"));
2054 &movdqa ($inout2,$inout0);
2055 &movdqu ($inout3,&QWP(0,$l_,$i3));
2056 &movdqa ($inout4,$inout0);
2058 &pxor ($inout0,$rndkey0); # ^ last offset_i
2059 &pxor ($inout1,$inout0);
2060 &movdqa (&QWP(16*0,"esp"),$inout0);
2061 &pxor ($inout2,$inout1);
2062 &movdqa (&QWP(16*1,"esp"),$inout1);
2063 &pxor ($inout3,$inout2);
2064 &movdqa (&QWP(16*2,"esp"),$inout2);
2065 &pxor ($inout4,$inout3);
2066 &movdqa (&QWP(16*3,"esp"),$inout3);
2067 &pxor ($inout5,$inout4);
2068 &movdqa (&QWP(16*4,"esp"),$inout4);
2070 &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
2071 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2072 &movdqu ($inout1,&QWP(16*1,$inp));
2073 &movdqu ($inout2,&QWP(16*2,$inp));
2074 &movdqu ($inout3,&QWP(16*3,$inp));
2075 &movdqu ($inout4,&QWP(16*4,$inp));
2076 &pxor ($inout5,$inout5);
2078 &pxor ($rndkey1,$inout0); # checksum
2079 &pxor ($inout0,$rndkey0); # ^ roundkey[0]
2080 &pxor ($rndkey1,$inout1);
2081 &pxor ($inout1,$rndkey0);
2082 &pxor ($rndkey1,$inout2);
2083 &pxor ($inout2,$rndkey0);
2084 &pxor ($rndkey1,$inout3);
2085 &pxor ($inout3,$rndkey0);
2086 &pxor ($rndkey1,$inout4);
2087 &pxor ($inout4,$rndkey0);
2088 &movdqa (&QWP($checksum,"esp"),$rndkey1);
2090 &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
2091 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2092 &pxor ($inout1,&QWP(16*1,"esp"));
2093 &pxor ($inout2,&QWP(16*2,"esp"));
2094 &pxor ($inout3,&QWP(16*3,"esp"));
2095 &pxor ($inout4,&QWP(16*4,"esp"));
2097 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
2098 &aesenc ($inout0,$rndkey1);
2099 &aesenc ($inout1,$rndkey1);
2100 &aesenc ($inout2,$rndkey1);
2101 &aesenc ($inout3,$rndkey1);
2102 &aesenc ($inout4,$rndkey1);
2103 &aesenc ($inout5,$rndkey1);
2105 &mov ($out,&DWP($out_off,"esp"));
2106 &call ("_aesni_encrypt6_enter");
2108 &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i
2109 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2110 &pxor ($inout1,&QWP(16*1,"esp"));
2111 &pxor ($inout2,&QWP(16*2,"esp"));
2112 &pxor ($inout3,&QWP(16*3,"esp"));
2113 &pxor ($inout4,$rndkey0);
2114 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2116 &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output
2117 &movdqu (&QWP(16*1,$out,$inp),$inout1);
2118 &movdqu (&QWP(16*2,$out,$inp),$inout2);
2119 &movdqu (&QWP(16*3,$out,$inp),$inout3);
2120 &movdqu (&QWP(16*4,$out,$inp),$inout4);
2122 &jmp (&label("done"));
2124 &set_label("one",16);
2125 &movdqu ($inout5,&QWP(0,$l_));
2126 &mov ($key,&DWP($key_off,"esp")); # restore key
2128 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2129 &mov ($rounds,&DWP(240,$key));
2131 &pxor ($inout5,$rndkey0); # ^ last offset_i
2132 &pxor ($rndkey1,$inout0); # checksum
2133 &pxor ($inout0,$inout5); # ^ offset_i
2135 &movdqa ($inout4,$rndkey1);
2136 &mov ($out,&DWP($out_off,"esp"));
2138 { &aesni_inline_generate1("enc"); }
2140 { &call ("_aesni_encrypt1"); }
2142 &xorps ($inout0,$inout5); # ^ offset_i
2143 &movdqa ($rndkey0,$inout5); # pass last offset_i
2144 &movdqa ($rndkey1,$inout4); # pass the checksum
2145 &movups (&QWP(0,$out,$inp),$inout0);
2147 &jmp (&label("done"));
2149 &set_label("two",16);
2150 &lea ($i1,&DWP(1,$block));
2151 &mov ($key,&DWP($key_off,"esp")); # restore key
2154 &movdqu ($inout4,&QWP(0,$l_));
2155 &movdqu ($inout5,&QWP(0,$l_,$i1));
2157 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2158 &movdqu ($inout1,&QWP(16*1,$inp));
2159 &mov ($rounds,&DWP(240,$key));
2161 &pxor ($inout4,$rndkey0); # ^ last offset_i
2162 &pxor ($inout5,$inout4);
2164 &pxor ($rndkey1,$inout0); # checksum
2165 &pxor ($inout0,$inout4); # ^ offset_i
2166 &pxor ($rndkey1,$inout1);
2167 &pxor ($inout1,$inout5);
2169 &movdqa ($inout3,$rndkey1)
2170 &mov ($out,&DWP($out_off,"esp"));
2171 &call ("_aesni_encrypt2");
2173 &xorps ($inout0,$inout4); # ^ offset_i
2174 &xorps ($inout1,$inout5);
2175 &movdqa ($rndkey0,$inout5); # pass last offset_i
2176 &movdqa ($rndkey1,$inout3); # pass the checksum
2177 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2178 &movups (&QWP(16*1,$out,$inp),$inout1);
2180 &jmp (&label("done"));
2182 &set_label("three",16);
2183 &lea ($i1,&DWP(1,$block));
2184 &mov ($key,&DWP($key_off,"esp")); # restore key
2187 &movdqu ($inout3,&QWP(0,$l_));
2188 &movdqu ($inout4,&QWP(0,$l_,$i1));
2189 &movdqa ($inout5,$inout3);
2191 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2192 &movdqu ($inout1,&QWP(16*1,$inp));
2193 &movdqu ($inout2,&QWP(16*2,$inp));
2194 &mov ($rounds,&DWP(240,$key));
2196 &pxor ($inout3,$rndkey0); # ^ last offset_i
2197 &pxor ($inout4,$inout3);
2198 &pxor ($inout5,$inout4);
2200 &pxor ($rndkey1,$inout0); # checksum
2201 &pxor ($inout0,$inout3); # ^ offset_i
2202 &pxor ($rndkey1,$inout1);
2203 &pxor ($inout1,$inout4);
2204 &pxor ($rndkey1,$inout2);
2205 &pxor ($inout2,$inout5);
2207 &movdqa (&QWP($checksum,"esp"),$rndkey1);
2208 &mov ($out,&DWP($out_off,"esp"));
2209 &call ("_aesni_encrypt3");
2211 &xorps ($inout0,$inout3); # ^ offset_i
2212 &xorps ($inout1,$inout4);
2213 &xorps ($inout2,$inout5);
2214 &movdqa ($rndkey0,$inout5); # pass last offset_i
2215 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2216 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2217 &movups (&QWP(16*1,$out,$inp),$inout1);
2218 &movups (&QWP(16*2,$out,$inp),$inout2);
2220 &jmp (&label("done"));
2222 &set_label("four",16);
2223 &lea ($i1,&DWP(1,$block));
2224 &lea ($i3,&DWP(3,$block));
2227 &mov ($key,&DWP($key_off,"esp")); # restore key
2230 &movdqu ($inout2,&QWP(0,$l_));
2231 &movdqu ($inout3,&QWP(0,$l_,$i1));
2232 &movdqa ($inout4,$inout2);
2233 &movdqu ($inout5,&QWP(0,$l_,$i3));
2235 &pxor ($inout2,$rndkey0); # ^ last offset_i
2236 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2237 &pxor ($inout3,$inout2);
2238 &movdqu ($inout1,&QWP(16*1,$inp));
2239 &pxor ($inout4,$inout3);
2240 &movdqa (&QWP(16*0,"esp"),$inout2);
2241 &pxor ($inout5,$inout4);
2242 &movdqa (&QWP(16*1,"esp"),$inout3);
2243 &movdqu ($inout2,&QWP(16*2,$inp));
2244 &movdqu ($inout3,&QWP(16*3,$inp));
2245 &mov ($rounds,&DWP(240,$key));
2247 &pxor ($rndkey1,$inout0); # checksum
2248 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2249 &pxor ($rndkey1,$inout1);
2250 &pxor ($inout1,&QWP(16*1,"esp"));
2251 &pxor ($rndkey1,$inout2);
2252 &pxor ($inout2,$inout4);
2253 &pxor ($rndkey1,$inout3);
2254 &pxor ($inout3,$inout5);
2256 &movdqa (&QWP($checksum,"esp"),$rndkey1)
2257 &mov ($out,&DWP($out_off,"esp"));
2258 &call ("_aesni_encrypt4");
2260 &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2261 &xorps ($inout1,&QWP(16*1,"esp"));
2262 &xorps ($inout2,$inout4);
2263 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2264 &xorps ($inout3,$inout5);
2265 &movups (&QWP(16*1,$out,$inp),$inout1);
2266 &movdqa ($rndkey0,$inout5); # pass last offset_i
2267 &movups (&QWP(16*2,$out,$inp),$inout2);
2268 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2269 &movups (&QWP(16*3,$out,$inp),$inout3);
2272 &mov ($key,&DWP($esp_off,"esp"));
2273 &pxor ($inout0,$inout0); # clear register bank
2274 &pxor ($inout1,$inout1);
2275 &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack
2276 &pxor ($inout2,$inout2);
2277 &movdqa (&QWP(16*1,"esp"),$inout0);
2278 &pxor ($inout3,$inout3);
2279 &movdqa (&QWP(16*2,"esp"),$inout0);
2280 &pxor ($inout4,$inout4);
2281 &movdqa (&QWP(16*3,"esp"),$inout0);
2282 &pxor ($inout5,$inout5);
2283 &movdqa (&QWP(16*4,"esp"),$inout0);
2284 &movdqa (&QWP(16*5,"esp"),$inout0);
2285 &movdqa (&QWP(16*6,"esp"),$inout0);
2287 &lea ("esp",&DWP(0,$key));
2288 &mov ($rounds,&wparam(5)); # &offset_i
2289 &mov ($rounds_,&wparam(7)); # &checksum
2290 &movdqu (&QWP(0,$rounds),$rndkey0);
2291 &pxor ($rndkey0,$rndkey0);
2292 &movdqu (&QWP(0,$rounds_),$rndkey1);
2293 &pxor ($rndkey1,$rndkey1);
2294 &function_end("aesni_ocb_encrypt");
# aesni_ocb_decrypt: emits the OCB-mode bulk decryption routine.
# Arguments, as read below through wparam(): 0 = input, 1 = output,
# 2 = length ($len), 3 = key schedule, 4 = starting block number,
# 5 = pointer to the running offset_i, 6 = L_ table, 7 = pointer to the
# running checksum.
# Between sections the running offset is carried in $rndkey0 and the
# running checksum in $rndkey1; both are written back through
# wparam(5)/wparam(7) at the very end, after the XMM registers and the
# stack scratch area have been zeroed.
# NOTE(review): several conditional branches below have no visible
# flag-setting instruction in front of them, and the "odd" and "done"
# labels they target are not defined anywhere in this listing -- confirm
# the control flow against the complete file.
2296 &function_begin("aesni_ocb_decrypt");
2297 &mov ($rounds,&wparam(5)); # &offset_i
2298 &mov ($rounds_,&wparam(7)); # &checksum
2300 &mov ($inp,&wparam(0));
2301 &mov ($out,&wparam(1));
2302 &mov ($len,&wparam(2));
2303 &mov ($key,&wparam(3));
2304 &movdqu ($rndkey0,&QWP(0,$rounds)); # load offset_i
2305 &mov ($block,&wparam(4)); # start_block_num
2306 &movdqu ($rndkey1,&QWP(0,$rounds_)); # load checksum
2307 &mov ($l_,&wparam(6)); # L_
# Build a 16-byte-aligned stack frame. The caller's %esp travels in $rounds
# until it is parked at $esp_off a few lines further down.
2309 &mov ($rounds,"esp");
2310 &sub ("esp",$esp_off+4); # alloca
2311 &and ("esp",-16); # align stack
# Save the output pointer, the loop end marker and the original %esp in the
# frame, then load $rounds from offset 240 of the key structure.
2315 &lea ($len,&DWP(-16*6,$inp,$len)); # end of input - 16*6
2316 &mov (&DWP($out_off,"esp"),$out);
2317 &mov (&DWP($end_off,"esp"),$len);
2318 &mov (&DWP($esp_off,"esp"),$rounds);
2320 &mov ($rounds,&DWP(240,$key));
2323 &jnz (&label("odd"));
# Single leading block: one L_ entry (indexed by $i3) is folded into the
# running offset, the block is decrypted on its own, and the resulting
# plaintext is folded into the checksum. $i3 doubles as a parking place for
# $key while the one-block primitive runs.
2328 &movdqu ($inout5,&QWP(0,$l_,$i3));
2329 &mov ($i3,$key); # put aside key
2331 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2332 &lea ($inp,&DWP(16,$inp));
2334 &pxor ($inout5,$rndkey0); # ^ last offset_i
2335 &pxor ($inout0,$inout5); # ^ offset_i
# The checksum is moved to $inout4 across the decryption and moved back
# into $rndkey1 afterwards.
2337 &movdqa ($inout4,$rndkey1);
# NOTE(review): the inline generator and the call appear back to back in
# this listing; presumably they are the two arms of a conditional whose
# if/else lines are not visible here -- confirm.
2339 { &aesni_inline_generate1("dec"); }
2341 { &call ("_aesni_decrypt1"); }
2343 &xorps ($inout0,$inout5); # ^ offset_i
2344 &movaps ($rndkey1,$inout4); # pass the checksum
2345 &movdqa ($rndkey0,$inout5); # pass last offset_i
2346 &xorps ($rndkey1,$inout0); # checksum
# $inp was already advanced by 16, hence the -16 displacement.
2347 &movups (&QWP(-16,$out,$inp),$inout0); # store output
2349 &mov ($rounds,&DWP(240,$i3));
2350 &mov ($key,$i3); # restore key
2351 &mov ($len,&DWP($end_off,"esp"));
# Key-schedule addressing setup: the original $key is saved at $key_off,
# $key is advanced to key+32+$rounds, and $out is used as scratch for the
# value saved at $rounds_off (the real output pointer is reloaded from
# $out_off before each batch of stores).
2356 &sub ($out,$rounds); # twisted rounds
2357 &mov (&DWP($key_off,"esp"),$key);
2358 &lea ($key,&DWP(32,$key,$rounds)); # end of key schedule
2359 &mov (&DWP($rounds_off,"esp"),$out);
2362 &ja (&label("short"));
2363 &jmp (&label("grandloop"));
# Main loop: six blocks per iteration.
2365 &set_label("grandloop",32);
# Index registers derived from $block+1, +3 and +5.
# NOTE(review): the instructions that turn these into byte offsets into L_
# are not visible in this listing.
2366 &lea ($i1,&DWP(1,$block));
2367 &lea ($i3,&DWP(3,$block));
2368 &lea ($i5,&DWP(5,$block));
# Fetch the L_ entries; entry 0 is reused for the 1st, 3rd and 5th block.
2376 &movdqu ($inout0,&QWP(0,$l_));
2377 &movdqu ($inout1,&QWP(0,$l_,$i1));
2378 &mov ($rounds,&DWP($rounds_off,"esp"));
2379 &movdqa ($inout2,$inout0);
2380 &movdqu ($inout3,&QWP(0,$l_,$i3));
2381 &movdqa ($inout4,$inout0);
2382 &movdqu ($inout5,&QWP(0,$l_,$i5));
# Chain the XORs so that each register becomes the offset for its block
# (each one is the previous offset ^ its L_ entry), and spill the six
# offsets to stack slots 16*0..16*5 for use before and after the AES core.
2384 &pxor ($inout0,$rndkey0); # ^ last offset_i
2385 &pxor ($inout1,$inout0);
2386 &movdqa (&QWP(16*0,"esp"),$inout0);
2387 &pxor ($inout2,$inout1);
2388 &movdqa (&QWP(16*1,"esp"),$inout1);
2389 &pxor ($inout3,$inout2);
2390 &movdqa (&QWP(16*2,"esp"),$inout2);
2391 &pxor ($inout4,$inout3);
2392 &movdqa (&QWP(16*3,"esp"),$inout3);
2393 &pxor ($inout5,$inout4);
2394 &movdqa (&QWP(16*4,"esp"),$inout4);
2395 &movdqa (&QWP(16*5,"esp"),$inout5);
# Load the first round key and six input blocks, then advance $inp.
2397 &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
2398 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2399 &movdqu ($inout1,&QWP(16*1,$inp));
2400 &movdqu ($inout2,&QWP(16*2,$inp));
2401 &movdqu ($inout3,&QWP(16*3,$inp));
2402 &movdqu ($inout4,&QWP(16*4,$inp));
2403 &movdqu ($inout5,&QWP(16*5,$inp));
2404 &lea ($inp,&DWP(16*6,$inp));
# $rndkey1 is about to be reused for a round key, so the checksum is
# spilled to the stack first.
2406 &movdqa (&QWP($checksum,"esp"),$rndkey1);
2407 &pxor ($inout0,$rndkey0); # ^ roundkey[0]
2408 &pxor ($inout1,$rndkey0);
2409 &pxor ($inout2,$rndkey0);
2410 &pxor ($inout3,$rndkey0);
2411 &pxor ($inout4,$rndkey0);
2412 &pxor ($inout5,$rndkey0);
2414 &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
2415 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2416 &pxor ($inout1,&QWP(16*1,"esp"));
2417 &pxor ($inout2,&QWP(16*2,"esp"));
2418 &pxor ($inout3,&QWP(16*3,"esp"));
2419 &pxor ($inout4,&QWP(16*4,"esp"));
2420 &pxor ($inout5,&QWP(16*5,"esp"));
# The first AES round is issued here with $rndkey1 while the following
# round key is preloaded into $rndkey0; the remaining rounds are left to
# _aesni_decrypt6_enter (defined elsewhere in the file).
2422 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
2423 &aesdec ($inout0,$rndkey1);
2424 &aesdec ($inout1,$rndkey1);
2425 &aesdec ($inout2,$rndkey1);
2426 &aesdec ($inout3,$rndkey1);
2427 &aesdec ($inout4,$rndkey1);
2428 &aesdec ($inout5,$rndkey1);
2430 &mov ($out,&DWP($out_off,"esp"));
2431 &mov ($len,&DWP($end_off,"esp"));
2432 &call ("_aesni_decrypt6_enter");
# Post-whitening: XOR each result with its offset again. The sixth offset
# is loaded into $rndkey0 so it is carried forward as the running offset.
2434 &movdqa ($rndkey0,&QWP(16*5,"esp")); # pass last offset_i
2435 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2436 &movdqa ($rndkey1,&QWP($checksum,"esp"));
2437 &pxor ($inout1,&QWP(16*1,"esp"));
2438 &pxor ($inout2,&QWP(16*2,"esp"));
2439 &pxor ($inout3,&QWP(16*3,"esp"));
2440 &pxor ($inout4,&QWP(16*4,"esp"));
2441 &pxor ($inout5,$rndkey0);
# Fold the six plaintext blocks into the checksum, interleaved with the
# stores. Displacements are negative because $inp has already moved past
# these six blocks.
2443 &pxor ($rndkey1,$inout0); # checksum
2444 &movdqu (&QWP(-16*6,$out,$inp),$inout0);# store output
2445 &pxor ($rndkey1,$inout1);
2446 &movdqu (&QWP(-16*5,$out,$inp),$inout1);
2447 &pxor ($rndkey1,$inout2);
2448 &movdqu (&QWP(-16*4,$out,$inp),$inout2);
2449 &pxor ($rndkey1,$inout3);
2450 &movdqu (&QWP(-16*3,$out,$inp),$inout3);
2451 &pxor ($rndkey1,$inout4);
2452 &movdqu (&QWP(-16*2,$out,$inp),$inout4);
2453 &pxor ($rndkey1,$inout5);
2454 &movdqu (&QWP(-16*1,$out,$inp),$inout5);
2455 &cmp ($inp,$len); # done yet?
2456 &jb (&label("grandloop"));
# Tail dispatch: branches to the one/two/three/four-block handlers and
# otherwise falls through to a five-block path.
# NOTE(review): the instructions that compute the remaining length and set
# the flags for these branches are not visible in this listing.
2458 &set_label("short");
2461 &jz (&label("done"));
2464 &jb (&label("one"));
2465 &je (&label("two"));
2468 &jb (&label("three"));
2469 &je (&label("four"));
# Five blocks: same scheme as the main loop, but only five offsets are
# derived and $inout5 is zeroed before entering the 6-way core.
2471 &lea ($i1,&DWP(1,$block));
2472 &lea ($i3,&DWP(3,$block));
2477 &movdqu ($inout0,&QWP(0,$l_));
2478 &movdqu ($inout1,&QWP(0,$l_,$i1));
2479 &mov ($rounds,&DWP($rounds_off,"esp"));
2480 &movdqa ($inout2,$inout0);
2481 &movdqu ($inout3,&QWP(0,$l_,$i3));
2482 &movdqa ($inout4,$inout0);
2484 &pxor ($inout0,$rndkey0); # ^ last offset_i
2485 &pxor ($inout1,$inout0);
2486 &movdqa (&QWP(16*0,"esp"),$inout0);
2487 &pxor ($inout2,$inout1);
2488 &movdqa (&QWP(16*1,"esp"),$inout1);
2489 &pxor ($inout3,$inout2);
2490 &movdqa (&QWP(16*2,"esp"),$inout2);
2491 &pxor ($inout4,$inout3);
2492 &movdqa (&QWP(16*3,"esp"),$inout3);
2493 &pxor ($inout5,$inout4);
2494 &movdqa (&QWP(16*4,"esp"),$inout4);
2496 &$movekey ($rndkey0,&QWP(-48,$key,$rounds));
2497 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2498 &movdqu ($inout1,&QWP(16*1,$inp));
2499 &movdqu ($inout2,&QWP(16*2,$inp));
2500 &movdqu ($inout3,&QWP(16*3,$inp));
2501 &movdqu ($inout4,&QWP(16*4,$inp));
2502 &pxor ($inout5,$inout5);
2504 &movdqa (&QWP($checksum,"esp"),$rndkey1);
2505 &pxor ($inout0,$rndkey0); # ^ roundkey[0]
2506 &pxor ($inout1,$rndkey0);
2507 &pxor ($inout2,$rndkey0);
2508 &pxor ($inout3,$rndkey0);
2509 &pxor ($inout4,$rndkey0);
2511 &$movekey ($rndkey1,&QWP(-32,$key,$rounds));
2512 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2513 &pxor ($inout1,&QWP(16*1,"esp"));
2514 &pxor ($inout2,&QWP(16*2,"esp"));
2515 &pxor ($inout3,&QWP(16*3,"esp"));
2516 &pxor ($inout4,&QWP(16*4,"esp"));
2518 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
2519 &aesdec ($inout0,$rndkey1);
2520 &aesdec ($inout1,$rndkey1);
2521 &aesdec ($inout2,$rndkey1);
2522 &aesdec ($inout3,$rndkey1);
2523 &aesdec ($inout4,$rndkey1);
2524 &aesdec ($inout5,$rndkey1);
2526 &mov ($out,&DWP($out_off,"esp"));
2527 &call ("_aesni_decrypt6_enter");
# The fifth offset becomes the running offset; the sixth lane's result is
# not used.
2529 &movdqa ($rndkey0,&QWP(16*4,"esp")); # pass last offset_i
2530 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2531 &movdqa ($rndkey1,&QWP($checksum,"esp"));
2532 &pxor ($inout1,&QWP(16*1,"esp"));
2533 &pxor ($inout2,&QWP(16*2,"esp"));
2534 &pxor ($inout3,&QWP(16*3,"esp"));
2535 &pxor ($inout4,$rndkey0);
# $inp was not advanced on this path, so displacements are positive.
2537 &pxor ($rndkey1,$inout0); # checksum
2538 &movdqu (&QWP(16*0,$out,$inp),$inout0); # store output
2539 &pxor ($rndkey1,$inout1);
2540 &movdqu (&QWP(16*1,$out,$inp),$inout1);
2541 &pxor ($rndkey1,$inout2);
2542 &movdqu (&QWP(16*2,$out,$inp),$inout2);
2543 &pxor ($rndkey1,$inout3);
2544 &movdqu (&QWP(16*3,$out,$inp),$inout3);
2545 &pxor ($rndkey1,$inout4);
2546 &movdqu (&QWP(16*4,$out,$inp),$inout4);
2548 &jmp (&label("done"));
# One block: offset kept in $inout5, checksum parked in $inout4 across the
# one-block primitive. The original key pointer and round count are
# reloaded because $key/$rounds were repurposed for the main loop.
2550 &set_label("one",16);
2551 &movdqu ($inout5,&QWP(0,$l_));
2552 &mov ($key,&DWP($key_off,"esp")); # restore key
2554 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2555 &mov ($rounds,&DWP(240,$key));
2557 &pxor ($inout5,$rndkey0); # ^ last offset_i
2558 &pxor ($inout0,$inout5); # ^ offset_i
2560 &movdqa ($inout4,$rndkey1);
2561 &mov ($out,&DWP($out_off,"esp"));
# NOTE(review): as above, both forms are listed; presumably alternatives
# selected by a condition not visible here -- confirm.
2563 { &aesni_inline_generate1("dec"); }
2565 { &call ("_aesni_decrypt1"); }
2567 &xorps ($inout0,$inout5); # ^ offset_i
2568 &movaps ($rndkey1,$inout4); # pass the checksum
2569 &movdqa ($rndkey0,$inout5); # pass last offset_i
2570 &xorps ($rndkey1,$inout0); # checksum
2571 &movups (&QWP(0,$out,$inp),$inout0);
2573 &jmp (&label("done"));
# Two blocks: offsets in $inout4/$inout5, checksum parked in $inout3.
2575 &set_label("two",16);
2576 &lea ($i1,&DWP(1,$block));
2577 &mov ($key,&DWP($key_off,"esp")); # restore key
2580 &movdqu ($inout4,&QWP(0,$l_));
2581 &movdqu ($inout5,&QWP(0,$l_,$i1));
2583 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2584 &movdqu ($inout1,&QWP(16*1,$inp));
2585 &mov ($rounds,&DWP(240,$key));
2587 &movdqa ($inout3,$rndkey1);
2588 &pxor ($inout4,$rndkey0); # ^ last offset_i
2589 &pxor ($inout5,$inout4);
2591 &pxor ($inout0,$inout4); # ^ offset_i
2592 &pxor ($inout1,$inout5);
2594 &mov ($out,&DWP($out_off,"esp"));
2595 &call ("_aesni_decrypt2");
2597 &xorps ($inout0,$inout4); # ^ offset_i
2598 &xorps ($inout1,$inout5);
2599 &movdqa ($rndkey0,$inout5); # pass last offset_i
2600 &xorps ($inout3,$inout0); # checksum
2601 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2602 &xorps ($inout3,$inout1);
2603 &movups (&QWP(16*1,$out,$inp),$inout1);
2604 &movaps ($rndkey1,$inout3); # pass the checksum
2606 &jmp (&label("done"));
# Three blocks: offsets in $inout3..$inout5; no XMM register is left for
# the checksum, so it is spilled to the stack slot at $checksum.
2608 &set_label("three",16);
2609 &lea ($i1,&DWP(1,$block));
2610 &mov ($key,&DWP($key_off,"esp")); # restore key
2613 &movdqu ($inout3,&QWP(0,$l_));
2614 &movdqu ($inout4,&QWP(0,$l_,$i1));
2615 &movdqa ($inout5,$inout3);
2617 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2618 &movdqu ($inout1,&QWP(16*1,$inp));
2619 &movdqu ($inout2,&QWP(16*2,$inp));
2620 &mov ($rounds,&DWP(240,$key));
2622 &movdqa (&QWP($checksum,"esp"),$rndkey1);
2623 &pxor ($inout3,$rndkey0); # ^ last offset_i
2624 &pxor ($inout4,$inout3);
2625 &pxor ($inout5,$inout4);
2627 &pxor ($inout0,$inout3); # ^ offset_i
2628 &pxor ($inout1,$inout4);
2629 &pxor ($inout2,$inout5);
2631 &mov ($out,&DWP($out_off,"esp"));
2632 &call ("_aesni_decrypt3");
2634 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2635 &xorps ($inout0,$inout3); # ^ offset_i
2636 &xorps ($inout1,$inout4);
2637 &xorps ($inout2,$inout5);
2638 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2639 &pxor ($rndkey1,$inout0); # checksum
2640 &movdqa ($rndkey0,$inout5); # pass last offset_i
2641 &movups (&QWP(16*1,$out,$inp),$inout1);
2642 &pxor ($rndkey1,$inout1);
2643 &movups (&QWP(16*2,$out,$inp),$inout2);
2644 &pxor ($rndkey1,$inout2);
2646 &jmp (&label("done"));
# Four blocks: the first two offsets are spilled to stack slots 16*0 and
# 16*1 so that $inout2/$inout3 can be reused for input; the last two
# offsets stay in $inout4/$inout5. The checksum is spilled as well.
2648 &set_label("four",16);
2649 &lea ($i1,&DWP(1,$block));
2650 &lea ($i3,&DWP(3,$block));
2653 &mov ($key,&DWP($key_off,"esp")); # restore key
2656 &movdqu ($inout2,&QWP(0,$l_));
2657 &movdqu ($inout3,&QWP(0,$l_,$i1));
2658 &movdqa ($inout4,$inout2);
2659 &movdqu ($inout5,&QWP(0,$l_,$i3));
2661 &pxor ($inout2,$rndkey0); # ^ last offset_i
2662 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
2663 &pxor ($inout3,$inout2);
2664 &movdqu ($inout1,&QWP(16*1,$inp));
2665 &pxor ($inout4,$inout3);
2666 &movdqa (&QWP(16*0,"esp"),$inout2);
2667 &pxor ($inout5,$inout4);
2668 &movdqa (&QWP(16*1,"esp"),$inout3);
2669 &movdqu ($inout2,&QWP(16*2,$inp));
2670 &movdqu ($inout3,&QWP(16*3,$inp));
2671 &mov ($rounds,&DWP(240,$key));
2673 &movdqa (&QWP($checksum,"esp"),$rndkey1);
2674 &pxor ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2675 &pxor ($inout1,&QWP(16*1,"esp"));
2676 &pxor ($inout2,$inout4);
2677 &pxor ($inout3,$inout5);
2679 &mov ($out,&DWP($out_off,"esp"));
2680 &call ("_aesni_decrypt4");
2682 &movdqa ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2683 &xorps ($inout0,&QWP(16*0,"esp")); # ^ offset_i
2684 &xorps ($inout1,&QWP(16*1,"esp"));
2685 &xorps ($inout2,$inout4);
2686 &movups (&QWP(16*0,$out,$inp),$inout0); # store output
2687 &pxor ($rndkey1,$inout0); # checksum
2688 &xorps ($inout3,$inout5);
2689 &movups (&QWP(16*1,$out,$inp),$inout1);
2690 &pxor ($rndkey1,$inout1);
2691 &movdqa ($rndkey0,$inout5); # pass last offset_i
2692 &movups (&QWP(16*2,$out,$inp),$inout2);
2693 &pxor ($rndkey1,$inout2);
2694 &movups (&QWP(16*3,$out,$inp),$inout3);
2695 &pxor ($rndkey1,$inout3);
# Common exit (NOTE(review): the "done" label itself is not visible in this
# listing). The saved original %esp is fetched into $key, the six data
# registers and seven 16-byte stack slots are zeroed, and only then is
# %esp switched back.
2698 &mov ($key,&DWP($esp_off,"esp"));
2699 &pxor ($inout0,$inout0); # clear register bank
2700 &pxor ($inout1,$inout1);
2701 &movdqa (&QWP(16*0,"esp"),$inout0); # clear stack
2702 &pxor ($inout2,$inout2);
2703 &movdqa (&QWP(16*1,"esp"),$inout0);
2704 &pxor ($inout3,$inout3);
2705 &movdqa (&QWP(16*2,"esp"),$inout0);
2706 &pxor ($inout4,$inout4);
2707 &movdqa (&QWP(16*3,"esp"),$inout0);
2708 &pxor ($inout5,$inout5);
2709 &movdqa (&QWP(16*4,"esp"),$inout0);
2710 &movdqa (&QWP(16*5,"esp"),$inout0);
2711 &movdqa (&QWP(16*6,"esp"),$inout0);
# Back on the caller's stack: write the updated offset and checksum to the
# buffers passed as arguments 5 and 7, zeroing each register once stored.
2713 &lea ("esp",&DWP(0,$key));
2714 &mov ($rounds,&wparam(5)); # &offset_i
2715 &mov ($rounds_,&wparam(7)); # &checksum
2716 &movdqu (&QWP(0,$rounds),$rndkey0);
2717 &pxor ($rndkey0,$rndkey0);
2718 &movdqu (&QWP(0,$rounds_),$rndkey1);
2719 &pxor ($rndkey1,$rndkey1);
2720 &function_end("aesni_ocb_decrypt");
2724 ######################################################################
2725 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
2726 # size_t length, const AES_KEY *key,
2727 # unsigned char *ivp,const int enc);
# Emits ${PREFIX}_cbc_encrypt. Arguments, as read below through wparam():
# 0 = input, 1 = output, 2 = length, 3 = key schedule, 4 = IV pointer,
# 5 = enc flag (compared against 0; "equal" selects the decrypt path).
# The updated IV is written back through argument 4 at cbc_ret.
# A small 16-byte-aligned scratch area is set up on the stack: offset 0
# holds one 16-byte block (saved IV or a partial final block) and offset 16
# holds the caller's original %esp.
# NOTE(review): several conditional branches below have no visible
# flag-setting instruction in front of them in this listing -- confirm the
# control flow against the complete file.
2728 &function_begin("${PREFIX}_cbc_encrypt");
2729 &mov ($inp,&wparam(0));
2730 &mov ($rounds_,"esp");
2731 &mov ($out,&wparam(1));
2733 &mov ($len,&wparam(2));
2734 &and ($rounds_,-16);
2735 &mov ($key,&wparam(3));
2736 &mov ($key_,&wparam(4));
2738 &jz (&label("cbc_abort"));
# The compare result is consumed by the je further down; the instructions
# in between are moves/xchg and leave the flags untouched.
2740 &cmp (&wparam(5),0);
2741 &xchg ($rounds_,"esp"); # alloca
2742 &movups ($ivec,&QWP(0,$key_)); # load IV
2743 &mov ($rounds,&DWP(240,$key));
2744 &mov ($key_,$key); # backup $key
2745 &mov (&DWP(16,"esp"),$rounds_); # save original %esp
2746 &mov ($rounds_,$rounds); # backup $rounds
2747 &je (&label("cbc_decrypt"));
# Encrypt path: $inout0 carries the chaining value (IV, then the previous
# ciphertext block).
2749 &movaps ($inout0,$ivec);
2751 &jb (&label("cbc_enc_tail"));
2753 &jmp (&label("cbc_enc_loop"));
# One block per iteration: the input block is loaded into $ivec, combined
# with the chaining value in $inout0 and encrypted in place.
2755 &set_label("cbc_enc_loop",16);
2756 &movups ($ivec,&QWP(0,$inp)); # input actually
2757 &lea ($inp,&DWP(16,$inp));
# NOTE(review): the inline generator and the xor+call form appear back to
# back in this listing; presumably they are the two arms of a conditional
# whose if/else lines are not visible here -- confirm.
2759 { &aesni_inline_generate1("enc",$inout0,$ivec); }
2761 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
2762 &mov ($rounds,$rounds_); # restore $rounds
2763 &mov ($key,$key_); # restore $key
2764 &movups (&QWP(0,$out),$inout0); # store output
2765 &lea ($out,&DWP(16,$out));
2767 &jnc (&label("cbc_enc_loop"));
2769 &jnz (&label("cbc_enc_tail"));
# No tail left: the last ciphertext block becomes the outgoing IV.
2770 &movaps ($ivec,$inout0);
2771 &pxor ($inout0,$inout0);
2772 &jmp (&label("cbc_ret"));
# Partial final block on encrypt: copy the remaining input bytes to the
# output buffer, zero-fill the rest of the block there, rewind $out by one
# block and run the loop once more with $inp pointing at that padded block.
# The string instructions are emitted as raw data words.
2774 &set_label("cbc_enc_tail");
2775 &mov ("ecx",$len); # zaps $rounds
2776 &data_word(0xA4F3F689); # rep movsb
2777 &mov ("ecx",16); # zero tail
2779 &xor ("eax","eax"); # zaps $len
2780 &data_word(0xAAF3F689); # rep stosb
2781 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
2782 &mov ($rounds,$rounds_); # restore $rounds
2783 &mov ($inp,$out); # $inp and $out are the same
2784 &mov ($key,$key_); # restore $key
2785 &jmp (&label("cbc_enc_loop"));
2786 ######################################################################
# Decrypt path. The IV for the current batch is kept at 0(%esp) because all
# XMM registers are needed by the 6-way core.
2787 &set_label("cbc_decrypt",16);
2789 &jbe (&label("cbc_dec_tail"));
2790 &movaps (&QWP(0,"esp"),$ivec); # save IV
2792 &jmp (&label("cbc_dec_loop6_enter"));
# Loop head for every iteration after the first: the last ciphertext block
# of the previous batch (in $rndkey0) becomes the saved IV, and the sixth
# plaintext block of the previous batch is stored now.
2794 &set_label("cbc_dec_loop6",16);
2795 &movaps (&QWP(0,"esp"),$rndkey0); # save IV
2796 &movups (&QWP(0,$out),$inout5);
2797 &lea ($out,&DWP(0x10,$out));
2798 &set_label("cbc_dec_loop6_enter");
2799 &movdqu ($inout0,&QWP(0,$inp));
2800 &movdqu ($inout1,&QWP(0x10,$inp));
2801 &movdqu ($inout2,&QWP(0x20,$inp));
2802 &movdqu ($inout3,&QWP(0x30,$inp));
2803 &movdqu ($inout4,&QWP(0x40,$inp));
2804 &movdqu ($inout5,&QWP(0x50,$inp));
2806 &call ("_aesni_decrypt6");
# CBC un-chaining: the first block is XORed with the saved IV, every other
# block with the ciphertext block preceding it, which is re-read from the
# input buffer through $rndkey0/$rndkey1 alternately.
2808 &movups ($rndkey1,&QWP(0,$inp));
2809 &movups ($rndkey0,&QWP(0x10,$inp));
2810 &xorps ($inout0,&QWP(0,"esp")); # ^=IV
2811 &xorps ($inout1,$rndkey1);
2812 &movups ($rndkey1,&QWP(0x20,$inp));
2813 &xorps ($inout2,$rndkey0);
2814 &movups ($rndkey0,&QWP(0x30,$inp));
2815 &xorps ($inout3,$rndkey1);
2816 &movups ($rndkey1,&QWP(0x40,$inp));
2817 &xorps ($inout4,$rndkey0);
2818 &movups ($rndkey0,&QWP(0x50,$inp)); # IV
2819 &xorps ($inout5,$rndkey1);
# Five of the six results are stored here; the sixth stays in $inout5 and
# is stored either at the loop head or after the loop.
2820 &movups (&QWP(0,$out),$inout0);
2821 &movups (&QWP(0x10,$out),$inout1);
2822 &lea ($inp,&DWP(0x60,$inp));
2823 &movups (&QWP(0x20,$out),$inout2);
2824 &mov ($rounds,$rounds_); # restore $rounds
2825 &movups (&QWP(0x30,$out),$inout3);
2826 &mov ($key,$key_); # restore $key
2827 &movups (&QWP(0x40,$out),$inout4);
2828 &lea ($out,&DWP(0x50,$out));
2830 &ja (&label("cbc_dec_loop6"));
# Loop exit: the pending sixth block moves to $inout0 and the last
# ciphertext block becomes the current IV.
2832 &movaps ($inout0,$inout5);
2833 &movaps ($ivec,$rndkey0);
2835 &jle (&label("cbc_dec_clear_tail_collected"));
2836 &movups (&QWP(0,$out),$inout0);
2837 &lea ($out,&DWP(0x10,$out));
# Tail: blocks are loaded one at a time, dispatching to the one/two/three/
# four-block handlers or falling through to a five-block path. $in0/$in1
# keep copies of the first two ciphertext blocks for un-chaining.
2838 &set_label("cbc_dec_tail");
2839 &movups ($inout0,&QWP(0,$inp));
2840 &movaps ($in0,$inout0);
2842 &jbe (&label("cbc_dec_one"));
2844 &movups ($inout1,&QWP(0x10,$inp));
2845 &movaps ($in1,$inout1);
2847 &jbe (&label("cbc_dec_two"));
2849 &movups ($inout2,&QWP(0x20,$inp));
2851 &jbe (&label("cbc_dec_three"));
2853 &movups ($inout3,&QWP(0x30,$inp));
2855 &jbe (&label("cbc_dec_four"));
# Five blocks: reuse the 6-way core with $inout5 zeroed. Every path below
# leaves the final plaintext block in $inout0 for cbc_dec_tail_collected
# and clears the other data registers it used.
2857 &movups ($inout4,&QWP(0x40,$inp));
2858 &movaps (&QWP(0,"esp"),$ivec); # save IV
2859 &movups ($inout0,&QWP(0,$inp));
2860 &xorps ($inout5,$inout5);
2861 &call ("_aesni_decrypt6");
2862 &movups ($rndkey1,&QWP(0,$inp));
2863 &movups ($rndkey0,&QWP(0x10,$inp));
2864 &xorps ($inout0,&QWP(0,"esp")); # ^= IV
2865 &xorps ($inout1,$rndkey1);
2866 &movups ($rndkey1,&QWP(0x20,$inp));
2867 &xorps ($inout2,$rndkey0);
2868 &movups ($rndkey0,&QWP(0x30,$inp));
2869 &xorps ($inout3,$rndkey1);
2870 &movups ($ivec,&QWP(0x40,$inp)); # IV
2871 &xorps ($inout4,$rndkey0);
2872 &movups (&QWP(0,$out),$inout0);
2873 &movups (&QWP(0x10,$out),$inout1);
2874 &pxor ($inout1,$inout1);
2875 &movups (&QWP(0x20,$out),$inout2);
2876 &pxor ($inout2,$inout2);
2877 &movups (&QWP(0x30,$out),$inout3);
2878 &pxor ($inout3,$inout3);
2879 &lea ($out,&DWP(0x40,$out));
2880 &movaps ($inout0,$inout4);
2881 &pxor ($inout4,$inout4);
2883 &jmp (&label("cbc_dec_tail_collected"));
2885 &set_label("cbc_dec_one",16);
# NOTE(review): both forms are listed; presumably alternatives selected by
# a condition not visible here -- confirm.
2887 { &aesni_inline_generate1("dec"); }
2889 { &call ("_aesni_decrypt1"); }
2890 &xorps ($inout0,$ivec);
2891 &movaps ($ivec,$in0);
2893 &jmp (&label("cbc_dec_tail_collected"));
2895 &set_label("cbc_dec_two",16);
2896 &call ("_aesni_decrypt2");
2897 &xorps ($inout0,$ivec);
2898 &xorps ($inout1,$in0);
2899 &movups (&QWP(0,$out),$inout0);
2900 &movaps ($inout0,$inout1);
2901 &pxor ($inout1,$inout1);
2902 &lea ($out,&DWP(0x10,$out));
2903 &movaps ($ivec,$in1);
2905 &jmp (&label("cbc_dec_tail_collected"));
2907 &set_label("cbc_dec_three",16);
2908 &call ("_aesni_decrypt3");
2909 &xorps ($inout0,$ivec);
2910 &xorps ($inout1,$in0);
2911 &xorps ($inout2,$in1);
2912 &movups (&QWP(0,$out),$inout0);
2913 &movaps ($inout0,$inout2);
2914 &pxor ($inout2,$inout2);
2915 &movups (&QWP(0x10,$out),$inout1);
2916 &pxor ($inout1,$inout1);
2917 &lea ($out,&DWP(0x20,$out));
2918 &movups ($ivec,&QWP(0x20,$inp));
2920 &jmp (&label("cbc_dec_tail_collected"));
2922 &set_label("cbc_dec_four",16);
2923 &call ("_aesni_decrypt4");
2924 &movups ($rndkey1,&QWP(0x10,$inp));
2925 &movups ($rndkey0,&QWP(0x20,$inp));
2926 &xorps ($inout0,$ivec);
2927 &movups ($ivec,&QWP(0x30,$inp));
2928 &xorps ($inout1,$in0);
2929 &movups (&QWP(0,$out),$inout0);
2930 &xorps ($inout2,$rndkey1);
2931 &movups (&QWP(0x10,$out),$inout1);
2932 &pxor ($inout1,$inout1);
2933 &xorps ($inout3,$rndkey0);
2934 &movups (&QWP(0x20,$out),$inout2);
2935 &pxor ($inout2,$inout2);
2936 &lea ($out,&DWP(0x30,$out));
2937 &movaps ($inout0,$inout3);
2938 &pxor ($inout3,$inout3);
2940 &jmp (&label("cbc_dec_tail_collected"));
# Entry used when leaving the 6-way loop with nothing more to decrypt:
# clear the data registers that still hold plaintext.
2942 &set_label("cbc_dec_clear_tail_collected",16);
2943 &pxor ($inout1,$inout1);
2944 &pxor ($inout2,$inout2);
2945 &pxor ($inout3,$inout3);
2946 &pxor ($inout4,$inout4);
# $inout0 holds the last plaintext block; store it whole, or take the
# partial path when only part of it belongs in the output.
2947 &set_label("cbc_dec_tail_collected");
2949 &jnz (&label("cbc_dec_tail_partial"));
2950 &movups (&QWP(0,$out),$inout0);
2951 &pxor ($rndkey0,$rndkey0);
2952 &jmp (&label("cbc_ret"));
# Partial final block: park the plaintext block at 0(%esp), copy bytes out
# with rep movsb, then wipe the stack copy.
# NOTE(review): the setup of the rep movsb operands is not visible in this
# listing.
# The wipe stores $rndkey0, which was zeroed just above. $inout0 still holds
# the plaintext at this point, so storing it would only rewrite the same
# plaintext and leave it on the stack after return.
2954 &set_label("cbc_dec_tail_partial",16);
2955 &movaps (&QWP(0,"esp"),$inout0);
2956 &pxor ($rndkey0,$rndkey0);
2960 &data_word(0xA4F3F689); # rep movsb
2961 &movdqa (&QWP(0,"esp"),$rndkey0); # clear stack
# Common return: switch back to the caller's stack, clear the remaining
# data registers, and write the updated IV through argument 4.
2963 &set_label("cbc_ret");
2964 &mov ("esp",&DWP(16,"esp")); # pull original %esp
2965 &mov ($key_,&wparam(4));
2966 &pxor ($inout0,$inout0);
2967 &pxor ($rndkey1,$rndkey1);
2968 &movups (&QWP(0,$key_),$ivec); # output IV
2969 &pxor ($ivec,$ivec);
2970 &set_label("cbc_abort");
2971 &function_end("${PREFIX}_cbc_encrypt");
2973 ######################################################################
2974 # Mechanical port from aesni-x86_64.pl.
2976 # _aesni_set_encrypt_key is private interface,
2978 # "eax" const unsigned char *userKey
2985 &function_begin_B("_aesni_set_encrypt_key");
2988 &test ("eax","eax");
2989 &jz (&label("bad_pointer"));
2991 &jz (&label("bad_pointer"));
2993 &call (&label("pic"));
2996 &lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
2998 &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
2999 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
3000 &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
3001 &mov ("ebp",&DWP(4,"ebp"));
3002 &lea ($key,&DWP(16,$key));
3003 &and ("ebp",1<<28|1<<11); # AVX and XOP bits
3005 &je (&label("14rounds"));
3007 &je (&label("12rounds"));
3009 &jne (&label("bad_keybits"));
3011 &set_label("10rounds",16);
3013 &je (&label("10rounds_alt"));
3016 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
3017 &aeskeygenassist("xmm1","xmm0",0x01); # round 1
3018 &call (&label("key_128_cold"));
3019 &aeskeygenassist("xmm1","xmm0",0x2); # round 2
3020 &call (&label("key_128"));
3021 &aeskeygenassist("xmm1","xmm0",0x04); # round 3
3022 &call (&label("key_128"));
3023 &aeskeygenassist("xmm1","xmm0",0x08); # round 4
3024 &call (&label("key_128"));
3025 &aeskeygenassist("xmm1","xmm0",0x10); # round 5
3026 &call (&label("key_128"));
3027 &aeskeygenassist("xmm1","xmm0",0x20); # round 6
3028 &call (&label("key_128"));
3029 &aeskeygenassist("xmm1","xmm0",0x40); # round 7
3030 &call (&label("key_128"));
3031 &aeskeygenassist("xmm1","xmm0",0x80); # round 8
3032 &call (&label("key_128"));
3033 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
3034 &call (&label("key_128"));
3035 &aeskeygenassist("xmm1","xmm0",0x36); # round 10
3036 &call (&label("key_128"));
3037 &$movekey (&QWP(0,$key),"xmm0");
3038 &mov (&DWP(80,$key),$rounds);
3040 &jmp (&label("good_key"));
3042 &set_label("key_128",16);
3043 &$movekey (&QWP(0,$key),"xmm0");
3044 &lea ($key,&DWP(16,$key));
3045 &set_label("key_128_cold");
3046 &shufps ("xmm4","xmm0",0b00010000);
3047 &xorps ("xmm0","xmm4");
3048 &shufps ("xmm4","xmm0",0b10001100);
3049 &xorps ("xmm0","xmm4");
3050 &shufps ("xmm1","xmm1",0b11111111); # critical path
3051 &xorps ("xmm0","xmm1");
3054 &set_label("10rounds_alt",16);
3055 &movdqa ("xmm5",&QWP(0x00,"ebx"));
3057 &movdqa ("xmm4",&QWP(0x20,"ebx"));
3058 &movdqa ("xmm2","xmm0");
3059 &movdqu (&QWP(-16,$key),"xmm0");
3061 &set_label("loop_key128");
3062 &pshufb ("xmm0","xmm5");
3063 &aesenclast ("xmm0","xmm4");
3065 &lea ($key,&DWP(16,$key));
3067 &movdqa ("xmm3","xmm2");
3069 &pxor ("xmm3","xmm2");
3071 &pxor ("xmm3","xmm2");
3073 &pxor ("xmm2","xmm3");
3075 &pxor ("xmm0","xmm2");
3076 &movdqu (&QWP(-16,$key),"xmm0");
3077 &movdqa ("xmm2","xmm0");
3080 &jnz (&label("loop_key128"));
3082 &movdqa ("xmm4",&QWP(0x30,"ebx"));
3084 &pshufb ("xmm0","xmm5");
3085 &aesenclast ("xmm0","xmm4");
3088 &movdqa ("xmm3","xmm2");
3090 &pxor ("xmm3","xmm2");
3092 &pxor ("xmm3","xmm2");
3094 &pxor ("xmm2","xmm3");
3096 &pxor ("xmm0","xmm2");
3097 &movdqu (&QWP(0,$key),"xmm0");
3099 &movdqa ("xmm2","xmm0");
3100 &pshufb ("xmm0","xmm5");
3101 &aesenclast ("xmm0","xmm4");
3103 &movdqa ("xmm3","xmm2");
3105 &pxor ("xmm3","xmm2");
3107 &pxor ("xmm3","xmm2");
3109 &pxor ("xmm2","xmm3");
3111 &pxor ("xmm0","xmm2");
3112 &movdqu (&QWP(16,$key),"xmm0");
3115 &mov (&DWP(96,$key),$rounds);
3117 &jmp (&label("good_key"));
3119 &set_label("12rounds",16);
3120 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
3122 &je (&label("12rounds_alt"));
3125 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
3126 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
3127 &call (&label("key_192a_cold"));
3128 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
3129 &call (&label("key_192b"));
3130 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
3131 &call (&label("key_192a"));
3132 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
3133 &call (&label("key_192b"));
3134 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
3135 &call (&label("key_192a"));
3136 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
3137 &call (&label("key_192b"));
3138 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
3139 &call (&label("key_192a"));
3140 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
3141 &call (&label("key_192b"));
3142 &$movekey (&QWP(0,$key),"xmm0");
3143 &mov (&DWP(48,$key),$rounds);
3145 &jmp (&label("good_key"));
3147 &set_label("key_192a",16);
3148 &$movekey (&QWP(0,$key),"xmm0");
3149 &lea ($key,&DWP(16,$key));
3150 &set_label("key_192a_cold",16);
3151 &movaps ("xmm5","xmm2");
3152 &set_label("key_192b_warm");
3153 &shufps ("xmm4","xmm0",0b00010000);
3154 &movdqa ("xmm3","xmm2");
3155 &xorps ("xmm0","xmm4");
3156 &shufps ("xmm4","xmm0",0b10001100);
3158 &xorps ("xmm0","xmm4");
3159 &pshufd ("xmm1","xmm1",0b01010101); # critical path
3160 &pxor ("xmm2","xmm3");
3161 &pxor ("xmm0","xmm1");
3162 &pshufd ("xmm3","xmm0",0b11111111);
3163 &pxor ("xmm2","xmm3");
3166 &set_label("key_192b",16);
3167 &movaps ("xmm3","xmm0");
3168 &shufps ("xmm5","xmm0",0b01000100);
3169 &$movekey (&QWP(0,$key),"xmm5");
3170 &shufps ("xmm3","xmm2",0b01001110);
3171 &$movekey (&QWP(16,$key),"xmm3");
3172 &lea ($key,&DWP(32,$key));
3173 &jmp (&label("key_192b_warm"));
3175 &set_label("12rounds_alt",16);
3176 &movdqa ("xmm5",&QWP(0x10,"ebx"));
3177 &movdqa ("xmm4",&QWP(0x20,"ebx"));
3179 &movdqu (&QWP(-16,$key),"xmm0");
3181 &set_label("loop_key192");
3182 &movq (&QWP(0,$key),"xmm2");
3183 &movdqa ("xmm1","xmm2");
3184 &pshufb ("xmm2","xmm5");
3185 &aesenclast ("xmm2","xmm4");
3187 &lea ($key,&DWP(24,$key));
3189 &movdqa ("xmm3","xmm0");
3191 &pxor ("xmm3","xmm0");
3193 &pxor ("xmm3","xmm0");
3195 &pxor ("xmm0","xmm3");
3197 &pshufd ("xmm3","xmm0",0xff);
3198 &pxor ("xmm3","xmm1");
3200 &pxor ("xmm3","xmm1");
3202 &pxor ("xmm0","xmm2");
3203 &pxor ("xmm2","xmm3");
3204 &movdqu (&QWP(-16,$key),"xmm0");
3207 &jnz (&label("loop_key192"));
3210 &mov (&DWP(32,$key),$rounds);
3212 &jmp (&label("good_key"));
3214 &set_label("14rounds",16);
3215 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
3216 &lea ($key,&DWP(16,$key));
3218 &je (&label("14rounds_alt"));
3221 &$movekey (&QWP(-32,$key),"xmm0"); # round 0
3222 &$movekey (&QWP(-16,$key),"xmm2"); # round 1
3223 &aeskeygenassist("xmm1","xmm2",0x01); # round 2
3224 &call (&label("key_256a_cold"));
3225 &aeskeygenassist("xmm1","xmm0",0x01); # round 3
3226 &call (&label("key_256b"));
3227 &aeskeygenassist("xmm1","xmm2",0x02); # round 4
3228 &call (&label("key_256a"));
3229 &aeskeygenassist("xmm1","xmm0",0x02); # round 5
3230 &call (&label("key_256b"));
3231 &aeskeygenassist("xmm1","xmm2",0x04); # round 6
3232 &call (&label("key_256a"));
3233 &aeskeygenassist("xmm1","xmm0",0x04); # round 7
3234 &call (&label("key_256b"));
3235 &aeskeygenassist("xmm1","xmm2",0x08); # round 8
3236 &call (&label("key_256a"));
3237 &aeskeygenassist("xmm1","xmm0",0x08); # round 9
3238 &call (&label("key_256b"));
3239 &aeskeygenassist("xmm1","xmm2",0x10); # round 10
3240 &call (&label("key_256a"));
3241 &aeskeygenassist("xmm1","xmm0",0x10); # round 11
3242 &call (&label("key_256b"));
3243 &aeskeygenassist("xmm1","xmm2",0x20); # round 12
3244 &call (&label("key_256a"));
3245 &aeskeygenassist("xmm1","xmm0",0x20); # round 13
3246 &call (&label("key_256b"));
3247 &aeskeygenassist("xmm1","xmm2",0x40); # round 14
3248 &call (&label("key_256a"));
3249 &$movekey (&QWP(0,$key),"xmm0");
3250 &mov (&DWP(16,$key),$rounds);
3253 &jmp (&label("good_key"));
3255 &set_label("key_256a",16);
3256 &$movekey (&QWP(0,$key),"xmm2");
3257 &lea ($key,&DWP(16,$key));
3258 &set_label("key_256a_cold");
3259 &shufps ("xmm4","xmm0",0b00010000);
3260 &xorps ("xmm0","xmm4");
3261 &shufps ("xmm4","xmm0",0b10001100);
3262 &xorps ("xmm0","xmm4");
3263 &shufps ("xmm1","xmm1",0b11111111); # critical path
3264 &xorps ("xmm0","xmm1");
3267 &set_label("key_256b",16);
3268 &$movekey (&QWP(0,$key),"xmm0");
3269 &lea ($key,&DWP(16,$key));
3271 &shufps ("xmm4","xmm2",0b00010000);
3272 &xorps ("xmm2","xmm4");
3273 &shufps ("xmm4","xmm2",0b10001100);
3274 &xorps ("xmm2","xmm4");
3275 &shufps ("xmm1","xmm1",0b10101010); # critical path
3276 &xorps ("xmm2","xmm1");
3279 &set_label("14rounds_alt",16);
3280 &movdqa ("xmm5",&QWP(0x00,"ebx"));
3281 &movdqa ("xmm4",&QWP(0x20,"ebx"));
3283 &movdqu (&QWP(-32,$key),"xmm0");
3284 &movdqa ("xmm1","xmm2");
3285 &movdqu (&QWP(-16,$key),"xmm2");
3287 &set_label("loop_key256");
3288 &pshufb ("xmm2","xmm5");
3289 &aesenclast ("xmm2","xmm4");
3291 &movdqa ("xmm3","xmm0");
3293 &pxor ("xmm3","xmm0");
3295 &pxor ("xmm3","xmm0");
3297 &pxor ("xmm0","xmm3");
3300 &pxor ("xmm0","xmm2");
3301 &movdqu (&QWP(0,$key),"xmm0");
3304 &jz (&label("done_key256"));
3306 &pshufd ("xmm2","xmm0",0xff);
3307 &pxor ("xmm3","xmm3");
3308 &aesenclast ("xmm2","xmm3");
3310 &movdqa ("xmm3","xmm1");
3312 &pxor ("xmm3","xmm1");
3314 &pxor ("xmm3","xmm1");
3316 &pxor ("xmm1","xmm3");
3318 &pxor ("xmm2","xmm1");
3319 &movdqu (&QWP(16,$key),"xmm2");
3320 &lea ($key,&DWP(32,$key));
3321 &movdqa ("xmm1","xmm2");
3322 &jmp (&label("loop_key256"));
3324 &set_label("done_key256");
3326 &mov (&DWP(16,$key),$rounds);
3328 &set_label("good_key");
3329 &pxor ("xmm0","xmm0");
3330 &pxor ("xmm1","xmm1");
3331 &pxor ("xmm2","xmm2");
3332 &pxor ("xmm3","xmm3");
3333 &pxor ("xmm4","xmm4");
3334 &pxor ("xmm5","xmm5");
3340 &set_label("bad_pointer",4);
3345 &set_label("bad_keybits",4);
3346 &pxor ("xmm0","xmm0");
3351 &function_end_B("_aesni_set_encrypt_key");
3353 # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits, AES_KEY *key)
# C-callable entry point for building the encryption key schedule.
# Loads the three stack arguments into the registers that
# _aesni_set_encrypt_key works with (eax = userKey pointer, $rounds = bits,
# $key = key schedule pointer), calls it, and returns with eax exactly as
# the helper left it.
3355 &function_begin_B("${PREFIX}_set_encrypt_key");
3356 &mov ("eax",&wparam(0));
3357 &mov ($rounds,&wparam(1));
3358 &mov ($key,&wparam(2));
3359 &call ("_aesni_set_encrypt_key");
# Explicit return is required here: the _B form of function_end only closes
# the function and emits no epilogue, so without this ret execution would
# fall through into whatever is emitted next.
&ret ();
3361 &function_end_B("${PREFIX}_set_encrypt_key");
3363 # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits, AES_KEY *key)
# C-callable entry point for building the decryption key schedule.
# It first runs _aesni_set_encrypt_key, then converts the resulting schedule
# in place: the round keys are reversed end-for-end, and every round key
# except the first and last is passed through aesimc.
# Returns (in eax) the non-zero value from _aesni_set_encrypt_key unchanged
# if that call failed, otherwise 0.
3365 &function_begin_B("${PREFIX}_set_decrypt_key");
3366 &mov ("eax",&wparam(0));
3367 &mov ($rounds,&wparam(1));
3368 &mov ($key,&wparam(2));
3369 &call ("_aesni_set_encrypt_key");
# The helper advances $key while it stores round keys, so reload the
# caller's original pointer before walking the schedule.
3370 &mov ($key,&wparam(2));
3371 &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
# Non-zero eax from the helper is passed straight back to the caller.
3372 &test ("eax","eax");
3373 &jnz (&label("dec_key_ret"));
3374 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
# From here on $key walks up from the first round key and eax walks down
# from the last one.
3376 &$movekey ("xmm0",&QWP(0,$key)); # just swap
3377 &$movekey ("xmm1",&QWP(0,"eax"));
3378 &$movekey (&QWP(0,"eax"),"xmm0");
3379 &$movekey (&QWP(0,$key),"xmm1");
3380 &lea ($key,&DWP(16,$key));
3381 &lea ("eax",&DWP(-16,"eax"));
3383 &set_label("dec_key_inverse");
3384 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
3385 &$movekey ("xmm1",&QWP(0,"eax"));
3386 &aesimc ("xmm0","xmm0");
3387 &aesimc ("xmm1","xmm1");
3388 &lea ($key,&DWP(16,$key));
3389 &lea ("eax",&DWP(-16,"eax"));
# Both pointers were already stepped above, hence the +16 / -16 offsets
# that address the slots just read.
3390 &$movekey (&QWP(16,"eax"),"xmm0");
3391 &$movekey (&QWP(-16,$key),"xmm1");
# Keep going while the upper pointer is still above the lower one. The
# compare must be issued here: lea does not update flags, so without it the
# branch would act on the stale result of the earlier test of eax.
&cmp ("eax",$key);
3393 &ja (&label("dec_key_inverse"));
# The two pointers have met on the single middle round key, which has no
# partner to swap with and only needs the transform.
3395 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
3396 &aesimc ("xmm0","xmm0");
3397 &$movekey (&QWP(0,$key),"xmm0");
# Clear the registers that held round-key material before returning.
3399 &pxor ("xmm0","xmm0");
3400 &pxor ("xmm1","xmm1");
3401 &xor ("eax","eax"); # return success
3402 &set_label("dec_key_ret");
# Common exit for the success and failure paths; function_end_B itself
# emits no return instruction.
&ret ();
3404 &function_end_B("${PREFIX}_set_decrypt_key");
# Constant table emitted under the label key_const, requested with 64-byte
# alignment. Each data_word call below lists four values; assuming
# data_word emits 32-bit words, each call forms one 16-byte row.
# The *_alt key-expansion paths in _aesni_set_encrypt_key load 16-byte rows
# at offsets 0x00, 0x10, 0x20 and 0x30 from ebx (rows at 0x00/0x10 are used
# as pshufb masks, rows at 0x20/0x30 as the aesenclast operand).
# NOTE(review): presumably ebx points at key_const on those paths - confirm
# against the set-up code earlier in _aesni_set_encrypt_key.
3406 &set_label("key_const",64);
# Row +0x00: byte-shuffle pattern, the same word repeated four times.
3407 &data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
# Row +0x10: second byte-shuffle pattern, the same word repeated four times.
3408 &data_word(0x04070605,0x04070605,0x04070605,0x04070605);
# Row +0x20: the value 1 in every word.
3409 &data_word(1,1,1,1);
# Row +0x30: the value 0x1b in every word.
3410 &data_word(0x1b,0x1b,0x1b,0x1b);
# Identification string embedded in the output (presumably NUL-terminated,
# going by the helper's name). The backslash keeps Perl from interpolating
# @openssl as an array inside the double-quoted string.
3411 &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");