3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # This module implements support for Intel AES-NI extension. In
11 # OpenSSL context it's used with Intel engine, but can also be used as
12 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
# Script configuration and perlasm bootstrap.
# NOTE(review): the embedded line numbers skip values throughout this file,
# so some statements (e.g. whatever loads the perlasm framework and most of
# the register aliases used below) are not visible in this excerpt.
15 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
16 # generates drop-in replacement for
17 # crypto/aes/asm/aes-586.pl:-)
18 $inline=1; # inline _aesni_[en|de]crypt
# Capture the directory part of $0 and add it, plus the perlasm directory
# relative to it, to the module search path.
20 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
21 push(@INC,"${dir}","${dir}../../perlasm");
24 &asm_init($ARGV[0],$0);
# Round-key loads/stores go through $movekey: movaps when generating the
# "aesni" flavour, movups for any other $PREFIX.
26 if ($PREFIX eq "aesni") { $movekey=*movaps; }
27 else { $movekey=*movups; }
34 $rounds_="ebx"; # backup copy for $rounds
35 $key_="ebp"; # backup copy for $key
# $in1 and $inout3 are both assigned "xmm7", i.e. they name the same register.
44 $in1="xmm7"; $inout3="xmm7";
46 # Inline version of internal aesni_[en|de]crypt1
# Emits the single-block cipher sequence in place at the call site: the
# first two round keys are loaded from $key, the first is xored into
# $inout0, then the "${p}1_loop" body applies aes${p} with successive round
# keys (advancing $key by 16 bytes per pass) and aes${p}last finishes.
# Callers in this file pass "enc" or "dec", which presumably becomes $p.
# NOTE(review): the sub's braces, the assignment of $p and the instruction
# that sets the flags tested by jnz are not visible in this excerpt --
# confirm against the complete file.
47 sub aesni_inline_generate1
50 &$movekey ($rndkey0,&QWP(0,$key));
51 &$movekey ($rndkey1,&QWP(16,$key));
52 &lea ($key,&DWP(32,$key));
53 &pxor ($inout0,$rndkey0);
54 &set_label("${p}1_loop");
55 eval"&aes${p} ($inout0,$rndkey1)";
57 &$movekey ($rndkey1,&QWP(0,$key));
58 &lea ($key,&DWP(16,$key));
59 &jnz (&label("${p}1_loop"));
60 eval"&aes${p}last ($inout0,$rndkey1)";
# Emits the out-of-line function "_aesni_${p}rypt1" with every round written
# out explicitly. Round keys alternate between $rndkey0 and $rndkey1 so the
# next key is loaded while the current one is in use.
# Entry at label "${p}128" runs 9 aes${p} rounds plus aes${p}last; entry at
# "${p}192" runs two more rounds before that; the fall-through path runs a
# further two on top.
# NOTE(review): the sub's braces, the assignment of $p, the comparison that
# feeds jb/je and the final return are not visible in this excerpt.
63 sub aesni_generate1 # fully unrolled loop
66 &function_begin_B("_aesni_${p}rypt1");
67 &$movekey ($rndkey0,&QWP(0,$key));
68 &$movekey ($rndkey1,&QWP(0x10,$key));
70 &pxor ($inout0,$rndkey0);
71 &$movekey ($rndkey0,&QWP(0x20,$key));
72 &lea ($key,&DWP(0x30,$key));
73 &jb (&label("${p}128"));
# lea does not modify flags, so je below still tests the same comparison.
74 &lea ($key,&DWP(0x20,$key));
75 &je (&label("${p}192"));
76 &lea ($key,&DWP(0x20,$key));
77 eval"&aes${p} ($inout0,$rndkey1)";
78 &$movekey ($rndkey1,&QWP(-0x40,$key));
79 eval"&aes${p} ($inout0,$rndkey0)";
80 &$movekey ($rndkey0,&QWP(-0x30,$key));
81 &set_label("${p}192");
82 eval"&aes${p} ($inout0,$rndkey1)";
83 &$movekey ($rndkey1,&QWP(-0x20,$key));
84 eval"&aes${p} ($inout0,$rndkey0)";
85 &$movekey ($rndkey0,&QWP(-0x10,$key));
86 &set_label("${p}128");
87 eval"&aes${p} ($inout0,$rndkey1)";
88 &$movekey ($rndkey1,&QWP(0,$key));
89 eval"&aes${p} ($inout0,$rndkey0)";
90 &$movekey ($rndkey0,&QWP(0x10,$key));
91 eval"&aes${p} ($inout0,$rndkey1)";
92 &$movekey ($rndkey1,&QWP(0x20,$key));
93 eval"&aes${p} ($inout0,$rndkey0)";
94 &$movekey ($rndkey0,&QWP(0x30,$key));
95 eval"&aes${p} ($inout0,$rndkey1)";
96 &$movekey ($rndkey1,&QWP(0x40,$key));
97 eval"&aes${p} ($inout0,$rndkey0)";
98 &$movekey ($rndkey0,&QWP(0x50,$key));
99 eval"&aes${p} ($inout0,$rndkey1)";
100 &$movekey ($rndkey1,&QWP(0x60,$key));
101 eval"&aes${p} ($inout0,$rndkey0)";
102 &$movekey ($rndkey0,&QWP(0x70,$key));
103 eval"&aes${p} ($inout0,$rndkey1)";
104 eval"&aes${p}last ($inout0,$rndkey0)";
106 &function_end_B("_aesni_${p}rypt1");
109 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# Single-block encrypt entry point: loads one 16-byte block from inp with an
# unaligned move, reads the dword at offset 240 of the key structure into
# $rounds, encrypts $inout0 and stores it to out with an unaligned move.
# The out-of-line _aesni_encrypt1 body is generated only when $inline is
# false.
# NOTE(review): the condition choosing between the inline expansion and the
# call below is not visible in this excerpt; as shown, both brace blocks
# would be emitted. The function's return is not visible either.
110 &aesni_generate1("enc") if (!$inline);
111 &function_begin_B("${PREFIX}_encrypt");
112 &mov ("eax",&wparam(0));
113 &mov ($key,&wparam(2));
114 &movups ($inout0,&QWP(0,"eax"));
115 &mov ($rounds,&DWP(240,$key));
# eax is reused: it now holds the output pointer for the store below.
116 &mov ("eax",&wparam(1));
118 { &aesni_inline_generate1("enc"); }
120 { &call ("_aesni_encrypt1"); }
121 &movups (&QWP(0,"eax"),$inout0);
123 &function_end_B("${PREFIX}_encrypt");
125 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
# Single-block decrypt entry point, the mirror image of ${PREFIX}_encrypt:
# loads one 16-byte block from inp, reads the dword at offset 240 of the key
# structure into $rounds, decrypts $inout0 and stores it to out.
# The out-of-line _aesni_decrypt1 body is generated only when $inline is
# false.
# NOTE(review): the condition choosing between the inline expansion and the
# call below is not visible in this excerpt; as shown, both brace blocks
# would be emitted. The function's return is not visible either.
126 &aesni_generate1("dec") if(!$inline);
127 &function_begin_B("${PREFIX}_decrypt");
128 &mov ("eax",&wparam(0));
129 &mov ($key,&wparam(2));
130 &movups ($inout0,&QWP(0,"eax"));
131 &mov ($rounds,&DWP(240,$key));
# eax is reused: it now holds the output pointer for the store below.
132 &mov ("eax",&wparam(1));
134 { &aesni_inline_generate1("dec"); }
136 { &call ("_aesni_decrypt1"); }
137 &movups (&QWP(0,"eax"),$inout0);
139 &function_end_B("${PREFIX}_decrypt");
141 # _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
142 # factor. Why is the 3x subroutine used in loops? Even though aes[enc|dec]
143 # latency is 6, it turned out that it can be scheduled only every
144 # *second* cycle. Thus 3x interleave is the one providing optimal
145 # utilization, i.e. when subroutine's throughput is virtually same as
146 # of non-interleaved subroutine [for number of input blocks up to 3].
147 # This is why it makes no sense to implement 2x subroutine. As soon
148 # as/if Intel improves throughput by making it possible to schedule
149 # the instructions in question *every* cycle I would have to
150 # implement 6x interleave and use it in loop...
# Emits "_aesni_${p}rypt3", which processes $inout0..$inout2 together.
# The first round key is xored into all three blocks; each pass of
# "${p}3_loop" then applies two aes${p} rounds to every block while loading
# the next two round keys and advancing $key by 32 bytes. After the loop one
# more aes${p} round and aes${p}last are applied to each block.
# NOTE(review): the enclosing sub header, the assignment of $p, the
# instructions that adjust $rounds / set the flags for jnz, and the return
# are not visible in this excerpt.
154 &function_begin_B("_aesni_${p}rypt3");
155 &$movekey ($rndkey0,&QWP(0,$key));
157 &$movekey ($rndkey1,&QWP(16,$key));
158 &lea ($key,&DWP(32,$key));
159 &pxor ($inout0,$rndkey0);
160 &pxor ($inout1,$rndkey0);
161 &pxor ($inout2,$rndkey0);
# The jump targets the label that immediately follows; presumably it skips
# the padding inserted for the label's alignment argument (16).
162 &jmp (&label("${p}3_loop"));
163 &set_label("${p}3_loop",16);
164 eval"&aes${p} ($inout0,$rndkey1)";
165 &$movekey ($rndkey0,&QWP(0,$key));
166 eval"&aes${p} ($inout1,$rndkey1)";
168 eval"&aes${p} ($inout2,$rndkey1)";
169 &$movekey ($rndkey1,&QWP(16,$key));
170 eval"&aes${p} ($inout0,$rndkey0)";
171 &lea ($key,&DWP(32,$key));
172 eval"&aes${p} ($inout1,$rndkey0)";
173 eval"&aes${p} ($inout2,$rndkey0)";
174 &jnz (&label("${p}3_loop"));
175 eval"&aes${p} ($inout0,$rndkey1)";
176 &$movekey ($rndkey0,&QWP(0,$key));
177 eval"&aes${p} ($inout1,$rndkey1)";
178 eval"&aes${p} ($inout2,$rndkey1)";
179 eval"&aes${p}last ($inout0,$rndkey0)";
180 eval"&aes${p}last ($inout1,$rndkey0)";
181 eval"&aes${p}last ($inout2,$rndkey0)";
183 &function_end_B("_aesni_${p}rypt3");
186 # 4x interleave is implemented to improve small block performance,
187 # most notably [and naturally] 4 block by ~30%. One can argue that one
188 # should have implemented 5x as well, but improvement would be <20%,
189 # so it's not worth it...
# Emits "_aesni_${p}rypt4", which processes $inout0..$inout3 together.
# The first round key is xored into all four blocks; each pass of
# "${p}4_loop" then applies two aes${p} rounds to every block while loading
# the next two round keys and advancing $key by 32 bytes. After the loop one
# more aes${p} round and aes${p}last are applied to each block.
# The loop label is named after this function's interleave factor instead of
# reusing the 3x routine's "${p}3_loop" name, so the two generators never
# share a label name.
# NOTE(review): the enclosing sub header, the assignment of $p, the
# instructions that adjust $rounds / set the flags for jnz, and the return
# are not visible in this excerpt.
193 &function_begin_B("_aesni_${p}rypt4");
194 &$movekey ($rndkey0,&QWP(0,$key));
195 &$movekey ($rndkey1,&QWP(16,$key));
197 &lea ($key,&DWP(32,$key));
198 &pxor ($inout0,$rndkey0);
199 &pxor ($inout1,$rndkey0);
200 &pxor ($inout2,$rndkey0);
201 &pxor ($inout3,$rndkey0);
# The jump targets the label that immediately follows; presumably it skips
# the padding inserted for the label's alignment argument (16).
202 &jmp (&label("${p}4_loop"));
203 &set_label("${p}4_loop",16);
204 eval"&aes${p} ($inout0,$rndkey1)";
205 &$movekey ($rndkey0,&QWP(0,$key));
206 eval"&aes${p} ($inout1,$rndkey1)";
208 eval"&aes${p} ($inout2,$rndkey1)";
209 eval"&aes${p} ($inout3,$rndkey1)";
210 &$movekey ($rndkey1,&QWP(16,$key));
211 eval"&aes${p} ($inout0,$rndkey0)";
212 &lea ($key,&DWP(32,$key));
213 eval"&aes${p} ($inout1,$rndkey0)";
214 eval"&aes${p} ($inout2,$rndkey0)";
215 eval"&aes${p} ($inout3,$rndkey0)";
216 &jnz (&label("${p}4_loop"));
217 eval"&aes${p} ($inout0,$rndkey1)";
218 &$movekey ($rndkey0,&QWP(0,$key));
219 eval"&aes${p} ($inout1,$rndkey1)";
220 eval"&aes${p} ($inout2,$rndkey1)";
221 eval"&aes${p} ($inout3,$rndkey1)";
222 eval"&aes${p}last ($inout0,$rndkey0)";
223 eval"&aes${p}last ($inout1,$rndkey0)";
224 eval"&aes${p}last ($inout2,$rndkey0)";
225 eval"&aes${p}last ($inout3,$rndkey0)";
227 &function_end_B("_aesni_${p}rypt4");
# Instantiate the interleaved helpers. The decrypt variants are always
# generated; the encrypt variants only when $PREFIX is "aesni".
229 &aesni_generate3("enc") if ($PREFIX eq "aesni");
230 &aesni_generate3("dec");
231 &aesni_generate4("enc") if ($PREFIX eq "aesni");
232 &aesni_generate4("dec");
234 if ($PREFIX eq "aesni") {
235 ######################################################################
236 # void aesni_ecb_encrypt (const void *in, void *out,
237 # size_t length, const AES_KEY *key,
#
# ECB driver. The fifth stack argument is tested for zero to select the
# decrypt path. Input is consumed three blocks per iteration through
# _aesni_[en|de]crypt3; what remains is dispatched to the one-, two-,
# three- or four-block tail handlers, all of which end at "ecb_ret".
# $key and $rounds are backed up in $key_/$rounds_ and restored after each
# call to the 3x helper.
# NOTE(review): the embedded line numbers skip values, so the cmp/sub/and
# instructions that set the flags for most conditional jumps below, and the
# guards around the inline/call alternatives, are not visible in this
# excerpt.
239 &function_begin("aesni_ecb_encrypt");
240 &mov ($inp,&wparam(0));
241 &mov ($out,&wparam(1));
242 &mov ($len,&wparam(2));
243 &mov ($key,&wparam(3));
244 &mov ($rounds,&wparam(4));
246 &jb (&label("ecb_ret"));
# Flags from this test survive the three movs below (mov leaves flags
# untouched) and feed the jz to "ecb_decrypt". Terminated with ';' so it is
# a statement of its own rather than the left operand of a binary '&'.
248 &test ($rounds,$rounds);
249 &mov ($rounds,&DWP(240,$key));
250 &mov ($key_,$key); # backup $key
251 &mov ($rounds_,$rounds); # backup $rounds
252 &jz (&label("ecb_decrypt"));
255 &jbe (&label("ecb_enc_tail"));
257 &jmp (&label("ecb_enc_loop3"));
259 &set_label("ecb_enc_loop3",16);
260 &movups ($inout0,&QWP(0,$inp));
261 &movups ($inout1,&QWP(0x10,$inp));
262 &movups ($inout2,&QWP(0x20,$inp));
263 &call ("_aesni_encrypt3");
# Pointers are advanced first, hence the negative offsets on the stores.
265 &lea ($inp,&DWP(0x30,$inp));
266 &lea ($out,&DWP(0x30,$out));
267 &movups (&QWP(-0x30,$out),$inout0);
268 &mov ($key,$key_); # restore $key
269 &movups (&QWP(-0x20,$out),$inout1);
270 &mov ($rounds,$rounds_); # restore $rounds
271 &movups (&QWP(-0x10,$out),$inout2);
272 &ja (&label("ecb_enc_loop3"));
275 &jz (&label("ecb_ret"));
277 &set_label("ecb_enc_tail");
279 &movups ($inout0,&QWP(0,$inp));
280 &jb (&label("ecb_enc_one"));
281 &movups ($inout1,&QWP(0x10,$inp));
282 &je (&label("ecb_enc_two"));
284 &movups ($inout2,&QWP(0x20,$inp));
285 &je (&label("ecb_enc_three"));
286 &movups ($inout3,&QWP(0x30,$inp));
287 &call ("_aesni_encrypt4");
288 &movups (&QWP(0,$out),$inout0);
289 &movups (&QWP(0x10,$out),$inout1);
290 &movups (&QWP(0x20,$out),$inout2);
291 &movups (&QWP(0x30,$out),$inout3);
# Uses the same '&' call form as every other instruction in this file.
292 &jmp (&label("ecb_ret"));
294 &set_label("ecb_enc_one",16);
296 { &aesni_inline_generate1("enc"); }
298 { &call ("_aesni_encrypt1"); }
299 &movups (&QWP(0,$out),$inout0);
300 &jmp (&label("ecb_ret"));
# Two blocks also go through the 3x helper; only the first two results are
# stored.
302 &set_label("ecb_enc_two",16);
303 &call ("_aesni_encrypt3");
304 &movups (&QWP(0,$out),$inout0);
305 &movups (&QWP(0x10,$out),$inout1);
306 &jmp (&label("ecb_ret"));
308 &set_label("ecb_enc_three",16);
309 &call ("_aesni_encrypt3");
310 &movups (&QWP(0,$out),$inout0);
311 &movups (&QWP(0x10,$out),$inout1);
312 &movups (&QWP(0x20,$out),$inout2);
313 &jmp (&label("ecb_ret"));
314 ######################################################################
315 &set_label("ecb_decrypt",16);
317 &jbe (&label("ecb_dec_tail"));
319 &jmp (&label("ecb_dec_loop3"));
321 &set_label("ecb_dec_loop3",16);
322 &movups ($inout0,&QWP(0,$inp));
323 &movups ($inout1,&QWP(0x10,$inp));
324 &movups ($inout2,&QWP(0x20,$inp));
325 &call ("_aesni_decrypt3");
# Pointers are advanced first, hence the negative offsets on the stores.
327 &lea ($inp,&DWP(0x30,$inp));
328 &lea ($out,&DWP(0x30,$out));
329 &movups (&QWP(-0x30,$out),$inout0);
330 &mov ($key,$key_); # restore $key
331 &movups (&QWP(-0x20,$out),$inout1);
332 &mov ($rounds,$rounds_); # restore $rounds
333 &movups (&QWP(-0x10,$out),$inout2);
334 &ja (&label("ecb_dec_loop3"));
337 &jz (&label("ecb_ret"));
339 &set_label("ecb_dec_tail");
341 &movups ($inout0,&QWP(0,$inp));
342 &jb (&label("ecb_dec_one"));
343 &movups ($inout1,&QWP(0x10,$inp));
344 &je (&label("ecb_dec_two"));
346 &movups ($inout2,&QWP(0x20,$inp));
347 &je (&label("ecb_dec_three"));
348 &movups ($inout3,&QWP(0x30,$inp));
349 &call ("_aesni_decrypt4");
350 &movups (&QWP(0,$out),$inout0);
351 &movups (&QWP(0x10,$out),$inout1);
352 &movups (&QWP(0x20,$out),$inout2);
353 &movups (&QWP(0x30,$out),$inout3);
354 &jmp (&label("ecb_ret"));
356 &set_label("ecb_dec_one",16);
358 { &aesni_inline_generate1("dec"); }
360 { &call ("_aesni_decrypt1"); }
361 &movups (&QWP(0,$out),$inout0);
362 &jmp (&label("ecb_ret"));
# Two blocks also go through the 3x helper; only the first two results are
# stored.
364 &set_label("ecb_dec_two",16);
365 &call ("_aesni_decrypt3");
366 &movups (&QWP(0,$out),$inout0);
367 &movups (&QWP(0x10,$out),$inout1);
368 &jmp (&label("ecb_ret"));
370 &set_label("ecb_dec_three",16);
371 &call ("_aesni_decrypt3");
372 &movups (&QWP(0,$out),$inout0);
373 &movups (&QWP(0x10,$out),$inout1);
374 &movups (&QWP(0x20,$out),$inout2);
# The three-block decrypt tail falls straight through to the exit label.
376 &set_label("ecb_ret");
377 &function_end("aesni_ecb_encrypt");
379 ######################################################################
380 # handles only complete blocks, operates on 32-bit counter and
381 # does not update *ivec! (see engine/eng_aesni.c for details)
383 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
384 # size_t blocks, const AES_KEY *key,
#
# Stack frame after alignment, as used by the code below:
#   0(esp)  byte-swap control mask for pshufb
#  16(esp)  counter increment vector
#  32(esp)  ivec with its counter dword wiped
#  48(esp)  value copied from $key_ and reloaded into esp at "ctr32_ret"
# The main loop encrypts three counter blocks per pass with _aesni_encrypt3
# and xors the keystream into the input; the tail handles one to four
# remaining blocks.
# NOTE(review): the embedded line numbers skip values, so the instructions
# that set $key_ and $rounds before they are stored to the frame, the
# initialisation of $ivec, the comparisons feeding the conditional jumps and
# the guards around the inline/call alternatives are not visible in this
# excerpt.
386 &function_begin("aesni_ctr32_encrypt_blocks");
387 &mov ($inp,&wparam(0));
388 &mov ($out,&wparam(1));
389 &mov ($len,&wparam(2));
390 &mov ($key,&wparam(3));
# $rounds_ temporarily holds the fifth argument; the ivec is loaded through
# it below.
391 &mov ($rounds_,&wparam(4));
394 &and ("esp",-16); # align stack
395 &mov (&DWP(48,"esp"),$key_);
397 &movups ($inout3,&QWP(0,$rounds_)); # load ivec
399 # compose byte-swap control mask for pshufb on stack
400 &mov (&DWP(0,"esp"),0x0c0d0e0f);
401 &mov (&DWP(4,"esp"),0x08090a0b);
402 &mov (&DWP(8,"esp"),0x04050607);
403 &mov (&DWP(12,"esp"),0x00010203);
405 # compose counter increment vector on stack
408 &mov (&DWP(16,"esp"),$rounds);
409 &mov (&DWP(20,"esp"),$rounds);
410 &mov (&DWP(24,"esp"),$rounds);
411 &mov (&DWP(28,"esp"),$key_);
413 &pextrd ($rounds_,$inout3,3); # pull 32-bit counter
414 &pinsrd ($inout3,$key_,3); # wipe 32-bit counter
416 &mov ($rounds,&DWP(240,$key)); # key->rounds
417 &movaps ($rndkey0,&QWP(0,"esp")); # load byte-swap mask
419 # $ivec is vector of 3 32-bit counters
422 &pinsrd ($ivec,$rounds_,0);
424 &pinsrd ($ivec,$rounds_,1);
426 &pinsrd ($ivec,$rounds_,2);
429 &pshufb ($ivec,$rndkey0); # byte swap
430 &jbe (&label("ctr32_tail"));
431 &movaps (&QWP(32,"esp"),$inout3); # save counter-less ivec
# From here on $rounds_ is the backup copy of $rounds.
432 &mov ($rounds_,$rounds);
435 &jmp (&label("ctr32_loop3"));
437 &set_label("ctr32_loop3",16);
438 &pshufd ($inout0,$ivec,3<<6); # place counter to upper dword
439 &pshufd ($inout1,$ivec,2<<6);
440 &pshufd ($inout2,$ivec,1<<6);
441 &por ($inout0,$inout3); # merge counter-less ivec
442 &por ($inout1,$inout3);
443 &por ($inout2,$inout3);
445 &call ("_aesni_encrypt3");
447 &movaps($rndkey0,&QWP(0,"esp")); # load byte-swap mask
448 &movups ($in0,&QWP(0,$inp));
449 &movups ($in1,&QWP(0x10,$inp));
450 &movups ($rndkey1,&QWP(0x20,$inp));
# The counters are byte-swapped, incremented, and swapped back further down.
451 &pshufb($ivec,$rndkey0); # byte swap
452 &paddd ($ivec,&QWP(16,"esp")); # counter increment
453 &pxor ($in0,$inout0);
454 &pxor ($in1,$inout1);
455 &pxor ($rndkey1,$inout2);
456 &movups (&QWP(0,$out),$in0);
457 &movups (&QWP(0x10,$out),$in1);
458 &movups (&QWP(0x20,$out),$rndkey1);
# $inout3 shares xmm7 with $in1, which was just used as scratch, so the
# counter-less ivec has to be reloaded.
459 &movaps ($inout3,&QWP(32,"esp")); # load counter-less ivec
460 &pshufb($ivec,$rndkey0); # byte swap
463 &lea ($inp,&DWP(0x30,$inp));
464 &lea ($out,&DWP(0x30,$out));
466 &mov ($rounds,$rounds_);
467 &ja (&label("ctr32_loop3"));
470 &pextrd ($rounds_,$ivec,1); # might need last counter value
471 &jz (&label("ctr32_ret"));
474 &set_label("ctr32_tail");
476 &pshufd ($inout0,$ivec,3<<6);
477 &pshufd ($inout1,$ivec,2<<6);
478 &pshufd ($inout2,$ivec,1<<6);
479 &por ($inout0,$inout3);
480 &jb (&label("ctr32_one"));
481 &por ($inout1,$inout3);
482 &je (&label("ctr32_two"));
484 &por ($inout2,$inout3);
485 &je (&label("ctr32_three"));
487 &inc ($rounds_); # compose last counter value
489 &pinsrd ($inout3,$rounds_,3);
491 &call ("_aesni_encrypt4");
# All four data registers are needed for keystream here, so the round-key
# and $ivec registers double as input buffers.
493 &movups ($in0,&QWP(0,$inp));
494 &movups ($rndkey1,&QWP(0x10,$inp));
495 &movups ($rndkey0,&QWP(0x20,$inp));
496 &movups ($ivec,&QWP(0x30,$inp));
497 &pxor ($in0,$inout0);
498 &pxor ($rndkey1,$inout1);
499 &pxor ($rndkey0,$inout2);
500 &pxor ($ivec,$inout3);
501 &movups (&QWP(0,$out),$in0);
502 &movups (&QWP(0x10,$out),$rndkey1);
503 &movups (&QWP(0x20,$out),$rndkey0);
504 &movups (&QWP(0x30,$out),$ivec);
505 &jmp (&label("ctr32_ret"));
507 &set_label("ctr32_one",16);
509 { &aesni_inline_generate1("enc"); }
511 { &call ("_aesni_encrypt1"); }
512 &movups ($in0,&QWP(0,$inp));
513 &pxor ($in0,$inout0);
514 &movups (&QWP(0,$out),$in0);
515 &jmp (&label("ctr32_ret"));
517 &set_label("ctr32_two",16);
518 &call ("_aesni_encrypt3");
519 &movups ($in0,&QWP(0,$inp));
520 &movups ($in1,&QWP(0x10,$inp));
521 &pxor ($in0,$inout0);
522 &pxor ($in1,$inout1);
523 &movups (&QWP(0,$out),$in0);
524 &movups (&QWP(0x10,$out),$in1);
525 &jmp (&label("ctr32_ret"));
527 &set_label("ctr32_three",16);
528 &call ("_aesni_encrypt3");
529 &movups ($in0,&QWP(0,$inp));
530 &movups ($in1,&QWP(0x10,$inp));
531 &movups ($rndkey1,&QWP(0x20,$inp));
532 &pxor ($in0,$inout0);
533 &pxor ($in1,$inout1);
534 &pxor ($rndkey1,$inout2);
535 &movups (&QWP(0,$out),$in0);
536 &movups (&QWP(0x10,$out),$in1);
537 &movups (&QWP(0x20,$out),$rndkey1);
539 &set_label("ctr32_ret");
# Reload esp from the slot written at function entry.
540 &mov ("esp",&DWP(48,"esp"));
541 &function_end("aesni_ctr32_encrypt_blocks");
544 ######################################################################
545 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
546 # size_t length, const AES_KEY *key,
547 # unsigned char *ivp,const int enc);
#
# CBC driver. The IV is loaded from *ivp into $ivec and written back through
# the same pointer at "cbc_ret".
# Encrypt path: one block per pass of "cbc_enc_loop", chaining through
# $inout0. "cbc_enc_tail" copies the leftover bytes to the output, zero-fills
# the rest of that block and re-enters the loop with $inp pointing at it.
# Decrypt path: three blocks per pass through _aesni_decrypt3, with $in0 and
# $in1 holding ciphertext copies for the chaining xor; the tail handles one
# to four blocks and a possible partial final block.
# NOTE(review): the embedded line numbers skip values, so the cmp/sub/test
# instructions that set the flags for most conditional jumps, the guards
# around the inline/call alternatives and parts of the partial-tail handling
# are not visible in this excerpt.
548 &function_begin("${PREFIX}_cbc_encrypt");
549 &mov ($inp,&wparam(0));
550 &mov ($out,&wparam(1));
551 &mov ($len,&wparam(2));
552 &mov ($key,&wparam(3));
# $key_ temporarily holds the IV pointer until the IV has been loaded.
554 &mov ($key_,&wparam(4));
555 &jz (&label("cbc_ret"));
558 &movups ($ivec,&QWP(0,$key_)); # load IV
559 &mov ($rounds,&DWP(240,$key));
560 &mov ($key_,$key); # backup $key
561 &mov ($rounds_,$rounds); # backup $rounds
562 &je (&label("cbc_decrypt"));
564 &movaps ($inout0,$ivec);
566 &jb (&label("cbc_enc_tail"));
568 &jmp (&label("cbc_enc_loop"));
570 &set_label("cbc_enc_loop",16);
571 &movups ($ivec,&QWP(0,$inp));
572 &lea ($inp,&DWP(16,$inp));
573 &pxor ($inout0,$ivec);
575 { &aesni_inline_generate1("enc"); }
577 { &call ("_aesni_encrypt1"); }
579 &lea ($out,&DWP(16,$out));
580 &mov ($rounds,$rounds_); # restore $rounds
581 &mov ($key,$key_); # restore $key
582 &movups (&QWP(-16,$out),$inout0);
583 &jnc (&label("cbc_enc_loop"));
585 &jnz (&label("cbc_enc_tail"));
586 &movaps ($ivec,$inout0);
587 &jmp (&label("cbc_ret"));
589 &set_label("cbc_enc_tail");
590 &mov ("ecx",$len); # zaps $rounds
591 &data_word(0xA4F3F689); # rep movsb
592 &mov ("ecx",16); # zero tail
594 &xor ("eax","eax"); # zaps $len
595 &data_word(0xAAF3F689); # rep stosb
596 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
597 &mov ($rounds,$rounds_); # restore $rounds
598 &mov ($inp,$out); # $inp and $out are the same
599 &mov ($key,$key_); # restore $key
600 &jmp (&label("cbc_enc_loop"));
601 ######################################################################
602 &set_label("cbc_decrypt",16);
604 &jbe (&label("cbc_dec_tail"));
606 &jmp (&label("cbc_dec_loop3"));
608 &set_label("cbc_dec_loop3",16);
609 &movups ($inout0,&QWP(0,$inp));
610 &movups ($inout1,&QWP(0x10,$inp));
611 &movups ($inout2,&QWP(0x20,$inp));
# Keep copies of the first two ciphertext blocks for the chaining xor.
612 &movaps ($in0,$inout0);
613 &movaps ($in1,$inout1);
614 &call ("_aesni_decrypt3");
616 &lea ($inp,&DWP(0x30,$inp));
617 &lea ($out,&DWP(0x30,$out));
618 &pxor ($inout0,$ivec);
619 &pxor ($inout1,$in0);
# The third ciphertext block is re-read from memory as the next IV.
620 &movups ($ivec,&QWP(-0x10,$inp));
621 &pxor ($inout2,$in1);
622 &movups (&QWP(-0x30,$out),$inout0);
# Terminated with ';' so it is a statement of its own rather than the left
# operand of a binary '&' with the store on the next line.
623 &mov ($rounds,$rounds_); # restore $rounds
624 &movups (&QWP(-0x20,$out),$inout1);
625 &mov ($key,$key_); # restore $key
626 &movups (&QWP(-0x10,$out),$inout2);
627 &ja (&label("cbc_dec_loop3"));
630 &jz (&label("cbc_ret"));
632 &set_label("cbc_dec_tail");
633 &movups ($inout0,&QWP(0,$inp));
635 &movaps ($in0,$inout0);
636 &jbe (&label("cbc_dec_one"));
637 &movups ($inout1,&QWP(0x10,$inp));
639 &movaps ($in1,$inout1);
640 &jbe (&label("cbc_dec_two"));
641 &movups ($inout2,&QWP(0x20,$inp));
643 &jbe (&label("cbc_dec_three"));
644 &movups ($inout3,&QWP(0x30,$inp));
645 &call ("_aesni_decrypt4");
# $inout3 shares xmm7 with $in1, so the later ciphertext blocks are re-read
# from memory into the round-key registers for the chaining xor.
646 &movups ($rndkey0,&QWP(0x10,$inp));
647 &movups ($rndkey1,&QWP(0x20,$inp));
648 &pxor ($inout0,$ivec);
649 &pxor ($inout1,$in0);
650 &movups ($ivec,&QWP(0x30,$inp));
651 &movups (&QWP(0,$out),$inout0);
652 &pxor ($inout2,$rndkey0);
653 &pxor ($inout3,$rndkey1);
654 &movups (&QWP(0x10,$out),$inout1);
655 &movups (&QWP(0x20,$out),$inout2);
# Every tail variant leaves its last plaintext block in $inout0 for
# "cbc_dec_tail_collected".
656 &movaps ($inout0,$inout3);
657 &lea ($out,&DWP(0x30,$out));
658 &jmp (&label("cbc_dec_tail_collected"));
660 &set_label("cbc_dec_one");
662 { &aesni_inline_generate1("dec"); }
664 { &call ("_aesni_decrypt1"); }
665 &pxor ($inout0,$ivec);
666 &movaps ($ivec,$in0);
667 &jmp (&label("cbc_dec_tail_collected"));
669 &set_label("cbc_dec_two");
670 &call ("_aesni_decrypt3");
671 &pxor ($inout0,$ivec);
672 &pxor ($inout1,$in0);
673 &movups (&QWP(0,$out),$inout0);
674 &movaps ($inout0,$inout1);
675 &movaps ($ivec,$in1);
676 &lea ($out,&DWP(0x10,$out));
677 &jmp (&label("cbc_dec_tail_collected"));
679 &set_label("cbc_dec_three");
680 &call ("_aesni_decrypt3");
681 &pxor ($inout0,$ivec);
682 &pxor ($inout1,$in0);
683 &pxor ($inout2,$in1);
684 &movups (&QWP(0,$out),$inout0);
685 &movups (&QWP(0x10,$out),$inout1);
686 &movaps ($inout0,$inout2);
687 &movups ($ivec,&QWP(0x20,$inp));
688 &lea ($out,&DWP(0x20,$out));
690 &set_label("cbc_dec_tail_collected");
692 &jnz (&label("cbc_dec_tail_partial"));
693 &movups (&QWP(0,$out),$inout0);
694 &jmp (&label("cbc_ret"));
696 &set_label("cbc_dec_tail_partial");
# The last block is spilled to the stack and copied out with rep movsb.
# NOTE(review): the surrounding stack/pointer setup is not visible here.
700 &movaps (&QWP(0,"esp"),$inout0);
703 &data_word(0xA4F3F689); # rep movsb
706 &set_label("cbc_ret");
707 &mov ($key_,&wparam(4));
708 &movups (&QWP(0,$key_),$ivec); # output IV
709 &function_end("${PREFIX}_cbc_encrypt");
711 ######################################################################
712 # Mechanical port from aesni-x86_64.pl.
714 # _aesni_set_encrypt_key is private interface,
716 # "eax" const unsigned char *userKey
#
# Expands the user key into the schedule addressed by $key. The code
# branches to "10rounds", "12rounds" or "14rounds"; each section issues one
# aeskeygenassist per expansion step and calls a local helper label
# (key_128*, key_192*, key_256*) that stores the finished round key and
# derives the next one. Each section ends by storing $rounds just past the
# schedule it wrote.
# NOTE(review): the embedded line numbers skip values, so the remaining
# input descriptions, the test/cmp instructions feeding the conditional
# jumps, the values loaded into $rounds, the ret instructions ending each
# helper and the bodies of the two error labels are not visible in this
# excerpt.
723 &function_begin_B("_aesni_set_encrypt_key");
725 &jz (&label("bad_pointer"));
727 &jz (&label("bad_pointer"));
729 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
730 &pxor ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
# $key is biased by 16, so round 0 is stored at offset -16 below.
731 &lea ($key,&DWP(16,$key));
733 &je (&label("14rounds"));
735 &je (&label("12rounds"));
737 &jne (&label("bad_keybits"));
739 &set_label("10rounds",16);
741 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
742 &aeskeygenassist("xmm1","xmm0",0x01); # round 1
743 &call (&label("key_128_cold"));
744 &aeskeygenassist("xmm1","xmm0",0x2); # round 2
745 &call (&label("key_128"));
746 &aeskeygenassist("xmm1","xmm0",0x04); # round 3
747 &call (&label("key_128"));
748 &aeskeygenassist("xmm1","xmm0",0x08); # round 4
749 &call (&label("key_128"));
750 &aeskeygenassist("xmm1","xmm0",0x10); # round 5
751 &call (&label("key_128"));
752 &aeskeygenassist("xmm1","xmm0",0x20); # round 6
753 &call (&label("key_128"));
754 &aeskeygenassist("xmm1","xmm0",0x40); # round 7
755 &call (&label("key_128"));
756 &aeskeygenassist("xmm1","xmm0",0x80); # round 8
757 &call (&label("key_128"));
758 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
759 &call (&label("key_128"));
760 &aeskeygenassist("xmm1","xmm0",0x36); # round 10
761 &call (&label("key_128"));
762 &$movekey (&QWP(0,$key),"xmm0");
763 &mov (&DWP(80,$key),$rounds);
# key_128 stores the current round key and advances $key; the "_cold" entry
# skips the store and is used for the first step only.
767 &set_label("key_128",16);
768 &$movekey (&QWP(0,$key),"xmm0");
769 &lea ($key,&DWP(16,$key));
770 &set_label("key_128_cold");
771 &shufps ("xmm4","xmm0",0b00010000);
772 &pxor ("xmm0","xmm4");
773 &shufps ("xmm4","xmm0",0b10001100);
774 &pxor ("xmm0","xmm4");
775 &pshufd ("xmm1","xmm1",0b11111111); # critical path
776 &pxor ("xmm0","xmm1");
779 &set_label("12rounds",16);
780 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
# Terminated with ';' so it is a statement of its own rather than the left
# operand of a binary '&' with the aeskeygenassist call that follows.
782 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
783 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
784 &call (&label("key_192a_cold"));
785 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
786 &call (&label("key_192b"));
787 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
788 &call (&label("key_192a"));
789 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
790 &call (&label("key_192b"));
791 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
792 &call (&label("key_192a"));
793 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
794 &call (&label("key_192b"));
795 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
796 &call (&label("key_192a"));
797 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
798 &call (&label("key_192b"));
799 &$movekey (&QWP(0,$key),"xmm0");
800 &mov (&DWP(48,$key),$rounds);
# key_192a stores one round key; key_192b (further down) stores two and
# then jumps back into the shared expansion code at "key_192b_warm".
804 &set_label("key_192a",16);
805 &$movekey (&QWP(0,$key),"xmm0");
806 &lea ($key,&DWP(16,$key));
807 &set_label("key_192a_cold",16);
808 &movaps ("xmm5","xmm2");
809 &set_label("key_192b_warm");
810 &shufps ("xmm4","xmm0",0b00010000);
811 &movaps ("xmm3","xmm2");
812 &pxor ("xmm0","xmm4");
813 &shufps ("xmm4","xmm0",0b10001100);
815 &pxor ("xmm0","xmm4");
816 &pshufd ("xmm1","xmm1",0b01010101); # critical path
817 &pxor ("xmm2","xmm3");
818 &pxor ("xmm0","xmm1");
819 &pshufd ("xmm3","xmm0",0b11111111);
820 &pxor ("xmm2","xmm3");
823 &set_label("key_192b",16);
824 &movaps ("xmm3","xmm0");
825 &shufps ("xmm5","xmm0",0b01000100);
826 &$movekey (&QWP(0,$key),"xmm5");
827 &shufps ("xmm3","xmm2",0b01001110);
828 &$movekey (&QWP(16,$key),"xmm3");
829 &lea ($key,&DWP(32,$key));
830 &jmp (&label("key_192b_warm"));
832 &set_label("14rounds",16);
833 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
835 &lea ($key,&DWP(16,$key));
836 &$movekey (&QWP(-32,$key),"xmm0"); # round 0
837 &$movekey (&QWP(-16,$key),"xmm2"); # round 1
838 &aeskeygenassist("xmm1","xmm2",0x01); # round 2
839 &call (&label("key_256a_cold"));
840 &aeskeygenassist("xmm1","xmm0",0x01); # round 3
841 &call (&label("key_256b"));
842 &aeskeygenassist("xmm1","xmm2",0x02); # round 4
843 &call (&label("key_256a"));
844 &aeskeygenassist("xmm1","xmm0",0x02); # round 5
845 &call (&label("key_256b"));
846 &aeskeygenassist("xmm1","xmm2",0x04); # round 6
847 &call (&label("key_256a"));
848 &aeskeygenassist("xmm1","xmm0",0x04); # round 7
849 &call (&label("key_256b"));
850 &aeskeygenassist("xmm1","xmm2",0x08); # round 8
851 &call (&label("key_256a"));
852 &aeskeygenassist("xmm1","xmm0",0x08); # round 9
853 &call (&label("key_256b"));
854 &aeskeygenassist("xmm1","xmm2",0x10); # round 10
855 &call (&label("key_256a"));
856 &aeskeygenassist("xmm1","xmm0",0x10); # round 11
857 &call (&label("key_256b"));
858 &aeskeygenassist("xmm1","xmm2",0x20); # round 12
859 &call (&label("key_256a"));
860 &aeskeygenassist("xmm1","xmm0",0x20); # round 13
861 &call (&label("key_256b"));
862 &aeskeygenassist("xmm1","xmm2",0x40); # round 14
863 &call (&label("key_256a"));
864 &$movekey (&QWP(0,$key),"xmm0");
865 &mov (&DWP(16,$key),$rounds);
# key_256a stores xmm2 and updates xmm0; key_256b stores xmm0 and updates
# xmm2, so the two helpers alternate between the key halves.
869 &set_label("key_256a",16);
870 &$movekey (&QWP(0,$key),"xmm2");
871 &lea ($key,&DWP(16,$key));
872 &set_label("key_256a_cold");
873 &shufps ("xmm4","xmm0",0b00010000);
874 &pxor ("xmm0","xmm4");
875 &shufps ("xmm4","xmm0",0b10001100);
876 &pxor ("xmm0","xmm4");
877 &pshufd ("xmm1","xmm1",0b11111111); # critical path
878 &pxor ("xmm0","xmm1");
881 &set_label("key_256b",16);
882 &$movekey (&QWP(0,$key),"xmm0");
883 &lea ($key,&DWP(16,$key));
885 &shufps ("xmm4","xmm2",0b00010000);
886 &pxor ("xmm2","xmm4");
887 &shufps ("xmm4","xmm2",0b10001100);
888 &pxor ("xmm2","xmm4");
889 &pshufd ("xmm1","xmm1",0b10101010); # critical path
890 &pxor ("xmm2","xmm1");
893 &set_label("bad_pointer",4);
896 &set_label("bad_keybits",4);
899 &function_end_B("_aesni_set_encrypt_key");
901 # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
# Public wrapper: moves the three stack arguments into eax, $rounds and $key
# and calls the private _aesni_set_encrypt_key.
# NOTE(review): the rest of the prototype comment and the function's return
# are not visible in this excerpt.
903 &function_begin_B("${PREFIX}_set_encrypt_key");
904 &mov ("eax",&wparam(0));
905 &mov ($rounds,&wparam(1));
906 &mov ($key,&wparam(2));
907 &call ("_aesni_set_encrypt_key");
909 &function_end_B("${PREFIX}_set_encrypt_key");
911 # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
# Builds the schedule with _aesni_set_encrypt_key and then converts it in
# place: the first and last round keys are swapped as they are, the
# remaining keys are swapped pairwise from both ends while being passed
# through aesimc, and the middle key is passed through aesimc where it
# stands. eax is zeroed before "dec_key_ret" on the success path.
# NOTE(review): the rest of the prototype comment, the test feeding jnz, the
# comparison feeding ja and the function's return are not visible in this
# excerpt.
913 &function_begin_B("${PREFIX}_set_decrypt_key");
914 &mov ("eax",&wparam(0));
915 &mov ($rounds,&wparam(1));
916 &mov ($key,&wparam(2));
917 &call ("_aesni_set_encrypt_key");
# Reload $key: the key-setup routine advanced it while writing the schedule.
918 &mov ($key,&wparam(2));
# Terminated with ';' so it is a statement of its own rather than the left
# operand of a binary '&' with the jnz call that follows.
919 &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
921 &jnz (&label("dec_key_ret"));
922 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
924 &$movekey ("xmm0",&QWP(0,$key)); # just swap
925 &$movekey ("xmm1",&QWP(0,"eax"));
926 &$movekey (&QWP(0,"eax"),"xmm0");
927 &$movekey (&QWP(0,$key),"xmm1");
928 &lea ($key,&DWP(16,$key));
929 &lea ("eax",&DWP(-16,"eax"));
931 &set_label("dec_key_inverse");
932 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
933 &$movekey ("xmm1",&QWP(0,"eax"));
934 &aesimc ("xmm0","xmm0");
935 &aesimc ("xmm1","xmm1");
# Both pointers are stepped before the stores, hence the +16/-16 offsets.
936 &lea ($key,&DWP(16,$key));
937 &lea ("eax",&DWP(-16,"eax"));
939 &$movekey (&QWP(16,"eax"),"xmm0");
940 &$movekey (&QWP(-16,$key),"xmm1");
941 &ja (&label("dec_key_inverse"));
943 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
944 &aesimc ("xmm0","xmm0");
945 &$movekey (&QWP(0,$key),"xmm0");
947 &xor ("eax","eax"); # return success
948 &set_label("dec_key_ret");
950 &function_end_B("${PREFIX}_set_decrypt_key");
# Emit the module's identification string; '@' is escaped so Perl does not
# interpolate an array inside the double-quoted literal.
951 &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");