3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # This module implements support for Intel AES-NI extension. In
11 # OpenSSL context it's used with Intel engine, but can also be used as
12 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
# Build-time configuration for the generator.
15 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
16 # generates drop-in replacement for
17 # crypto/aes/asm/aes-586.pl:-)
18 $inline=1; # inline _aesni_[en|de]crypt
# Take the directory part of $0 and add it, plus the relative perlasm
# directory, to the module search path.
20 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
21 push(@INC,"${dir}","${dir}../../perlasm");
24 &asm_init($ARGV[0],$0);
# Instruction used for every round-key load/store via &$movekey(...).
# The previous expression tested an unset variable ($RREFIX) against the
# misspelled string "aseni", so its condition was always false and
# "*movups" was always selected. That effective choice is now spelled out
# directly; the generated code is unchanged.
# NOTE(review): if movaps was intended for $PREFIX eq "aesni", confirm that
# every key schedule is 16-byte aligned before switching -- movaps faults
# on unaligned memory operands.
26 $movekey = *movups;
33 $rounds_="ebx"; # backup copy for $rounds
34 $key_="ebp"; # backup copy for $key
# $in1 and $inout3 are both assigned xmm7.
43 $in1="xmm7"; $inout3="xmm7";
45 # Inline version of internal aesni_[en|de]crypt1
# Emits the single-block round sequence at the point of call.
# $p is interpolated into the mnemonics ("aes${p}", "aes${p}last") and the
# loop label; the call sites in this file pass "enc" or "dec".
# NOTE(review): the lines that open/close the sub body, bind $p, and set the
# flags consumed by jnz are not visible in this excerpt.
46 sub aesni_inline_generate1
# Load the first two round keys and step $key past them.
49 &$movekey ($rndkey0,&QWP(0,$key));
50 &$movekey ($rndkey1,&QWP(16,$key));
51 &lea ($key,&DWP(32,$key));
# XOR the first round key into the block.
52 &pxor ($inout0,$rndkey0);
# Loop: one aes${p} round with $rndkey1, then fetch the next round key and
# advance $key by 16 bytes.
53 &set_label("${p}1_loop");
54 eval"&aes${p} ($inout0,$rndkey1)";
56 &$movekey ($rndkey1,&QWP(0,$key));
57 &lea ($key,&DWP(16,$key));
58 &jnz (&label("${p}1_loop"));
# Final round uses the "last" form with the key fetched by the loop.
59 eval"&aes${p}last ($inout0,$rndkey1)";
# Emits the out-of-line function _aesni_${p}rypt1 as straight-line code.
# Entering at label ${p}128 executes nine aes${p} rounds plus one
# aes${p}last; entering at ${p}192 adds two rounds in front of that, and the
# fall-through path adds two more again.
# NOTE(review): the comparison feeding jb/je and the sub's opening/closing
# lines are not visible in this excerpt; the label names suggest the branch
# selects by key size -- TODO confirm.
62 sub aesni_generate1 # fully unrolled loop
65 &function_begin_B("_aesni_${p}rypt1");
66 &$movekey ($rndkey0,&QWP(0,$key));
67 &$movekey ($rndkey1,&QWP(0x10,$key));
# XOR the first round key into the block, then reuse $rndkey0 for the key at
# offset 0x20.
69 &pxor ($inout0,$rndkey0);
70 &$movekey ($rndkey0,&QWP(0x20,$key));
# $key is advanced by 0x30 here and by a further 0x20 on each longer path,
# so the offsets used after ${p}128 are the same on all three paths.
71 &lea ($key,&DWP(0x30,$key));
72 &jb (&label("${p}128"));
73 &lea ($key,&DWP(0x20,$key));
74 &je (&label("${p}192"));
75 &lea ($key,&DWP(0x20,$key));
# From here on the rounds alternate between $rndkey1 and $rndkey0; each
# register is reloaded with a later round key right after it has been used.
76 eval"&aes${p} ($inout0,$rndkey1)";
77 &$movekey ($rndkey1,&QWP(-0x40,$key));
78 eval"&aes${p} ($inout0,$rndkey0)";
79 &$movekey ($rndkey0,&QWP(-0x30,$key));
80 &set_label("${p}192");
81 eval"&aes${p} ($inout0,$rndkey1)";
82 &$movekey ($rndkey1,&QWP(-0x20,$key));
83 eval"&aes${p} ($inout0,$rndkey0)";
84 &$movekey ($rndkey0,&QWP(-0x10,$key));
85 &set_label("${p}128");
86 eval"&aes${p} ($inout0,$rndkey1)";
87 &$movekey ($rndkey1,&QWP(0,$key));
88 eval"&aes${p} ($inout0,$rndkey0)";
89 &$movekey ($rndkey0,&QWP(0x10,$key));
90 eval"&aes${p} ($inout0,$rndkey1)";
91 &$movekey ($rndkey1,&QWP(0x20,$key));
92 eval"&aes${p} ($inout0,$rndkey0)";
93 &$movekey ($rndkey0,&QWP(0x30,$key));
94 eval"&aes${p} ($inout0,$rndkey1)";
95 &$movekey ($rndkey1,&QWP(0x40,$key));
96 eval"&aes${p} ($inout0,$rndkey0)";
97 &$movekey ($rndkey0,&QWP(0x50,$key));
98 eval"&aes${p} ($inout0,$rndkey1)";
99 &$movekey ($rndkey1,&QWP(0x60,$key));
100 eval"&aes${p} ($inout0,$rndkey0)";
101 &$movekey ($rndkey0,&QWP(0x70,$key));
102 eval"&aes${p} ($inout0,$rndkey1)";
# Final round with the key loaded from offset 0x70.
103 eval"&aes${p}last ($inout0,$rndkey0)";
105 &function_end_B("_aesni_${p}rypt1");
108 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# The out-of-line _aesni_encrypt1 helper is emitted only when inlining is
# disabled.
109 &aesni_generate1("enc") if (!$inline);
110 &function_begin_B("${PREFIX}_encrypt");
# eax = inp, $key = key; load the 16-byte input block into $inout0.
111 &mov ("eax",&wparam(0));
112 &mov ($key,&wparam(2));
113 &movups ($inout0,&QWP(0,"eax"));
# $rounds is read from offset 240 of the key structure; eax then becomes out.
114 &mov ($rounds,&DWP(240,$key));
115 &mov ("eax",&wparam(1));
# NOTE(review): the condition choosing between the inline expansion and the
# call is not visible in this excerpt (presumably keyed on $inline).
117 { &aesni_inline_generate1("enc"); }
119 { &call ("_aesni_encrypt1"); }
# Store the processed block to out.
120 &movups (&QWP(0,"eax"),$inout0);
122 &function_end_B("${PREFIX}_encrypt");
124 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
# The out-of-line _aesni_decrypt1 helper is emitted only when inlining is
# disabled.
125 &aesni_generate1("dec") if(!$inline);
126 &function_begin_B("${PREFIX}_decrypt");
# eax = inp, $key = key; load the 16-byte input block into $inout0.
127 &mov ("eax",&wparam(0));
128 &mov ($key,&wparam(2));
129 &movups ($inout0,&QWP(0,"eax"));
# $rounds is read from offset 240 of the key structure; eax then becomes out.
130 &mov ($rounds,&DWP(240,$key));
131 &mov ("eax",&wparam(1));
# NOTE(review): the condition choosing between the inline expansion and the
# call is not visible in this excerpt (presumably keyed on $inline).
133 { &aesni_inline_generate1("dec"); }
135 { &call ("_aesni_decrypt1"); }
# Store the processed block to out.
136 &movups (&QWP(0,"eax"),$inout0);
138 &function_end_B("${PREFIX}_decrypt");
140 # _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
141 # factor. Why is 3x subroutine used in loops? Even though aes[enc|dec]
142 # latency is 6, it turned out that it can be scheduled only every
143 # *second* cycle. Thus 3x interleave is the one providing optimal
144 # utilization, i.e. when subroutine's throughput is virtually same as
145 # of non-interleaved subroutine [for number of input blocks up to 3].
146 # This is why it makes no sense to implement 2x subroutine. As soon
147 # as/if Intel improves throughput by making it possible to schedule
148 # the instructions in question *every* cycle, I would have to
149 # implement 6x interleave and use it in loop...
# Body of the 3x interleaved helper _aesni_${p}rypt3: runs the same round
# sequence on $inout0, $inout1 and $inout2 with shared round keys.
# NOTE(review): the enclosing sub header, the binding of $p, and the
# instructions that set the flags tested by jnz are not visible in this
# excerpt.
153 &function_begin_B("_aesni_${p}rypt3");
# Load the first two round keys and step $key past them.
154 &$movekey ($rndkey0,&QWP(0,$key));
156 &$movekey ($rndkey1,&QWP(16,$key));
157 &lea ($key,&DWP(32,$key));
# XOR the first round key into all three blocks.
158 &pxor ($inout0,$rndkey0);
159 &pxor ($inout1,$rndkey0);
160 &pxor ($inout2,$rndkey0);
161 &jmp (&label("${p}3_loop"));
# Each iteration applies two rounds to every block: one with $rndkey1, one
# with $rndkey0, reloading both keys and advancing $key by 32 bytes.
162 &set_label("${p}3_loop",16);
163 eval"&aes${p} ($inout0,$rndkey1)";
164 &$movekey ($rndkey0,&QWP(0,$key));
165 eval"&aes${p} ($inout1,$rndkey1)";
167 eval"&aes${p} ($inout2,$rndkey1)";
168 &$movekey ($rndkey1,&QWP(16,$key));
169 eval"&aes${p} ($inout0,$rndkey0)";
170 &lea ($key,&DWP(32,$key));
171 eval"&aes${p} ($inout1,$rndkey0)";
172 eval"&aes${p} ($inout2,$rndkey0)";
173 &jnz (&label("${p}3_loop"));
# After the loop: one more round with $rndkey1, then the final "last" round
# with the key just loaded into $rndkey0.
174 eval"&aes${p} ($inout0,$rndkey1)";
175 &$movekey ($rndkey0,&QWP(0,$key));
176 eval"&aes${p} ($inout1,$rndkey1)";
177 eval"&aes${p} ($inout2,$rndkey1)";
178 eval"&aes${p}last ($inout0,$rndkey0)";
179 eval"&aes${p}last ($inout1,$rndkey0)";
180 eval"&aes${p}last ($inout2,$rndkey0)";
182 &function_end_B("_aesni_${p}rypt3");
185 # 4x interleave is implemented to improve small block performance,
186 # most notably [and naturally] 4 block by ~30%. One can argue that one
187 # should have implemented 5x as well, but improvement would be <20%,
188 # so it's not worth it...
# Body of the 4x interleaved helper _aesni_${p}rypt4: same structure as the
# 3x helper, with $inout3 as a fourth block.
# NOTE(review): the enclosing sub header, the binding of $p, and the
# instructions that set the flags tested by jnz are not visible in this
# excerpt.
192 &function_begin_B("_aesni_${p}rypt4");
# Load the first two round keys and step $key past them.
193 &$movekey ($rndkey0,&QWP(0,$key));
194 &$movekey ($rndkey1,&QWP(16,$key));
196 &lea ($key,&DWP(32,$key));
# XOR the first round key into all four blocks.
197 &pxor ($inout0,$rndkey0);
198 &pxor ($inout1,$rndkey0);
199 &pxor ($inout2,$rndkey0);
200 &pxor ($inout3,$rndkey0);
# NOTE(review): the loop label is named "${p}3_loop", the same name the 3x
# helper uses; whether the assembler layer scopes labels per function is
# not visible here -- TODO confirm.
201 &jmp (&label("${p}3_loop"));
# Each iteration applies two rounds to every block and advances $key by 32.
202 &set_label("${p}3_loop",16);
203 eval"&aes${p} ($inout0,$rndkey1)";
204 &$movekey ($rndkey0,&QWP(0,$key));
205 eval"&aes${p} ($inout1,$rndkey1)";
207 eval"&aes${p} ($inout2,$rndkey1)";
208 eval"&aes${p} ($inout3,$rndkey1)";
209 &$movekey ($rndkey1,&QWP(16,$key));
210 eval"&aes${p} ($inout0,$rndkey0)";
211 &lea ($key,&DWP(32,$key));
212 eval"&aes${p} ($inout1,$rndkey0)";
213 eval"&aes${p} ($inout2,$rndkey0)";
214 eval"&aes${p} ($inout3,$rndkey0)";
215 &jnz (&label("${p}3_loop"));
# After the loop: one more round with $rndkey1, then the final "last" round
# with the key just loaded into $rndkey0.
216 eval"&aes${p} ($inout0,$rndkey1)";
217 &$movekey ($rndkey0,&QWP(0,$key));
218 eval"&aes${p} ($inout1,$rndkey1)";
219 eval"&aes${p} ($inout2,$rndkey1)";
220 eval"&aes${p} ($inout3,$rndkey1)";
221 eval"&aes${p}last ($inout0,$rndkey0)";
222 eval"&aes${p}last ($inout1,$rndkey0)";
223 eval"&aes${p}last ($inout2,$rndkey0)";
224 eval"&aes${p}last ($inout3,$rndkey0)";
226 &function_end_B("_aesni_${p}rypt4");
# Emit the interleaved helpers. The "dec" variants are always generated
# (the CBC routine calls _aesni_decrypt3/_aesni_decrypt4); the "enc"
# variants are generated only for the "aesni" prefix, which is also the
# condition guarding the ECB routine that calls them.
228 &aesni_generate3("enc") if ($PREFIX eq "aesni");
229 &aesni_generate3("dec");
230 &aesni_generate4("enc") if ($PREFIX eq "aesni");
231 &aesni_generate4("dec");
# ECB entry point, generated only for the "aesni" prefix.
# NOTE(review): many compare/subtract lines that feed the conditional jumps
# below, and the closing brace of this if-block, are not visible in this
# excerpt; comments describe only what the visible lines do.
233 if ($PREFIX eq "aesni") {
234 # void aesni_ecb_encrypt (const void *in, void *out,
235 # size_t length, const AES_KEY *key,
237 &function_begin("aesni_ecb_encrypt");
# Fetch the five stack arguments; the fifth is held in $rounds only until
# it has been tested below.
238 &mov ($inp,&wparam(0));
239 &mov ($out,&wparam(1));
240 &mov ($len,&wparam(2));
241 &mov ($key,&wparam(3));
242 &mov ($rounds,&wparam(4));
244 &jb (&label("ecb_ret"));
# Test the fifth argument, then overwrite $rounds with the value at offset
# 240 of the key. mov leaves the flags alone, so the jz below still sees
# the result of the test and branches to the decrypt path when the
# argument is zero.
# (This statement previously lacked its terminating semicolon, which made
# Perl parse the following "&mov (...)" as the right operand of a binary &.)
246 &test ($rounds,$rounds);
247 &mov ($rounds,&DWP(240,$key));
248 &mov ($key_,$key); # backup $key
249 &mov ($rounds_,$rounds); # backup $rounds
250 &jz (&label("ecb_decrypt"));
253 &jbe (&label("ecb_enc_tail"));
254 &jmp (&label("ecb_enc_loop3"));
# Main encrypt loop: three blocks per iteration.
256 &set_label("ecb_enc_loop3",16);
257 &movups ($inout0,&QWP(0,$inp));
258 &movups ($inout1,&QWP(0x10,$inp));
259 &movups ($inout2,&QWP(0x20,$inp));
260 &call ("_aesni_encrypt3");
# Advance both pointers first, then store at negative offsets from the
# advanced $out; $key and $rounds are reloaded from their backups.
262 &lea ($inp,&DWP(0x30,$inp));
263 &lea ($out,&DWP(0x30,$out));
264 &movups (&QWP(-0x30,$out),$inout0);
265 &mov ($key,$key_); # restore $key
266 &movups (&QWP(-0x20,$out),$inout1);
267 &mov ($rounds,$rounds_); # restore $rounds
268 &movups (&QWP(-0x10,$out),$inout2);
269 &ja (&label("ecb_enc_loop3"));
# Tail: dispatch on the number of remaining blocks, loading one more block
# before each branch.
271 &set_label("ecb_enc_tail");
273 &jz (&label("ecb_ret"));
276 &movups ($inout0,&QWP(0,$inp));
277 &je (&label("ecb_enc_one"));
279 &movups ($inout1,&QWP(0x10,$inp));
280 &je (&label("ecb_enc_two"));
282 &movups ($inout2,&QWP(0x20,$inp));
283 &je (&label("ecb_enc_three"));
# Four blocks remain: use the 4x helper.
284 &movups ($inout3,&QWP(0x30,$inp));
285 &call ("_aesni_encrypt4");
286 &movups (&QWP(0,$out),$inout0);
287 &movups (&QWP(0x10,$out),$inout1);
288 &movups (&QWP(0x20,$out),$inout2);
289 &movups (&QWP(0x30,$out),$inout3);
# (The "&" sigil was missing on this call; added for consistency with every
# other instruction call in the file.)
290 &jmp (&label("ecb_ret"));
# One block remains.
# NOTE(review): the condition choosing between the inline expansion and the
# call is not visible in this excerpt.
292 &set_label("ecb_enc_one",16);
294 { &aesni_inline_generate1("enc"); }
296 { &call ("_aesni_encrypt1"); }
297 &movups (&QWP(0,$out),$inout0);
298 &jmp (&label("ecb_ret"));
# Two blocks remain: the 3x helper is called and only two results stored.
300 &set_label("ecb_enc_two",16);
301 &call ("_aesni_encrypt3");
302 &movups (&QWP(0,$out),$inout0);
303 &movups (&QWP(0x10,$out),$inout1);
304 &jmp (&label("ecb_ret"));
# Three blocks remain.
306 &set_label("ecb_enc_three",16);
307 &call ("_aesni_encrypt3");
308 &movups (&QWP(0,$out),$inout0);
309 &movups (&QWP(0x10,$out),$inout1);
310 &movups (&QWP(0x20,$out),$inout2);
311 &jmp (&label("ecb_ret"));
# Decrypt path: mirrors the encrypt path using the _aesni_decrypt* helpers.
313 &set_label("ecb_decrypt",16);
315 &jbe (&label("ecb_dec_tail"));
316 &jmp (&label("ecb_dec_loop3"));
# Main decrypt loop: three blocks per iteration.
318 &set_label("ecb_dec_loop3",16);
319 &movups ($inout0,&QWP(0,$inp));
320 &movups ($inout1,&QWP(0x10,$inp));
321 &movups ($inout2,&QWP(0x20,$inp));
322 &call ("_aesni_decrypt3");
324 &lea ($inp,&DWP(0x30,$inp));
325 &lea ($out,&DWP(0x30,$out));
326 &movups (&QWP(-0x30,$out),$inout0);
327 &mov ($key,$key_); # restore $key
328 &movups (&QWP(-0x20,$out),$inout1);
329 &mov ($rounds,$rounds_); # restore $rounds
330 &movups (&QWP(-0x10,$out),$inout2);
331 &ja (&label("ecb_dec_loop3"));
# Tail dispatch for decryption.
333 &set_label("ecb_dec_tail");
335 &jz (&label("ecb_ret"));
338 &movups ($inout0,&QWP(0,$inp));
339 &je (&label("ecb_dec_one"));
341 &movups ($inout1,&QWP(0x10,$inp));
342 &je (&label("ecb_dec_two"));
344 &movups ($inout2,&QWP(0x20,$inp));
345 &je (&label("ecb_dec_three"));
# Four blocks remain: use the 4x helper.
346 &movups ($inout3,&QWP(0x30,$inp));
347 &call ("_aesni_decrypt4");
348 &movups (&QWP(0,$out),$inout0);
349 &movups (&QWP(0x10,$out),$inout1);
350 &movups (&QWP(0x20,$out),$inout2);
351 &movups (&QWP(0x30,$out),$inout3);
352 &jmp (&label("ecb_ret"));
# One block remains.
354 &set_label("ecb_dec_one",16);
356 { &aesni_inline_generate1("dec"); }
358 { &call ("_aesni_decrypt1"); }
359 &movups (&QWP(0,$out),$inout0);
360 &jmp (&label("ecb_ret"));
# Two blocks remain: the 3x helper is called and only two results stored.
362 &set_label("ecb_dec_two",16);
363 &call ("_aesni_decrypt3");
364 &movups (&QWP(0,$out),$inout0);
365 &movups (&QWP(0x10,$out),$inout1);
366 &jmp (&label("ecb_ret"));
# Three blocks remain; this case falls through to ecb_ret.
368 &set_label("ecb_dec_three",16);
369 &call ("_aesni_decrypt3");
370 &movups (&QWP(0,$out),$inout0);
371 &movups (&QWP(0x10,$out),$inout1);
372 &movups (&QWP(0x20,$out),$inout2);
374 &set_label("ecb_ret");
375 &function_end("aesni_ecb_encrypt");
378 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
379 # size_t length, const AES_KEY *key,
380 # unsigned char *ivp,const int enc);
# NOTE(review): the compare/subtract lines feeding most conditional jumps
# below are not visible in this excerpt; comments describe only what the
# visible lines do.
381 &function_begin("${PREFIX}_cbc_encrypt");
382 &mov ($inp,&wparam(0));
383 &mov ($out,&wparam(1));
384 &mov ($len,&wparam(2));
385 &mov ($key,&wparam(3));
# $key_ temporarily holds ivp so the IV can be loaded; it is then reused as
# the backup copy of $key.
387 &mov ($key_,&wparam(4));
388 &jz (&label("cbc_ret"));
391 &movups ($ivec,&QWP(0,$key_)); # load IV
392 &mov ($rounds,&DWP(240,$key));
393 &mov ($key_,$key); # backup $key
394 &mov ($rounds_,$rounds); # backup $rounds
395 &je (&label("cbc_decrypt"));
# Encrypt path: $inout0 starts as a copy of the IV and afterwards carries
# the previous output block.
397 &movaps ($inout0,$ivec);
399 &jb (&label("cbc_enc_tail"));
401 &jmp (&label("cbc_enc_loop"));
# Per block: load the input block into $ivec, XOR it into $inout0, run the
# single-block encryption, and store the result.
403 &set_label("cbc_enc_loop",16);
404 &movups ($ivec,&QWP(0,$inp));
405 &lea ($inp,&DWP(16,$inp));
406 &pxor ($inout0,$ivec);
# NOTE(review): the condition choosing between the inline expansion and the
# call is not visible in this excerpt.
408 { &aesni_inline_generate1("enc"); }
410 { &call ("_aesni_encrypt1"); }
412 &lea ($out,&DWP(16,$out));
413 &mov ($rounds,$rounds_); # restore $rounds
414 &mov ($key,$key_); # restore $key
415 &movups (&QWP(-16,$out),$inout0);
416 &jnc (&label("cbc_enc_loop"));
418 &jnz (&label("cbc_enc_tail"));
# No partial block left: the last output block becomes the IV written back
# at cbc_ret.
419 &movaps ($ivec,$inout0);
420 &jmp (&label("cbc_ret"));
# Partial final block: copy the remaining bytes, zero-fill, rewind $out by
# one block, point $inp at the same place, and run the loop once more.
422 &set_label("cbc_enc_tail");
423 &mov ("ecx",$len); # zaps $rounds
424 &data_word(0xA4F3F689); # rep movsb
425 &mov ("ecx",16); # zero tail
427 &xor ("eax","eax"); # zaps $len
428 &data_word(0xAAF3F689); # rep stosb
429 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
430 &mov ($rounds,$rounds_); # restore $rounds
431 &mov ($inp,$out); # $inp and $out are the same
432 &mov ($key,$key_); # restore $key
433 &jmp (&label("cbc_enc_loop"));
# Decrypt path.
435 &set_label("cbc_decrypt",16);
437 &jbe (&label("cbc_dec_tail"));
438 &jmp (&label("cbc_dec_loop3"));
# Three blocks per iteration. $in0/$in1 keep copies of the first two input
# blocks, because each decrypted block is XORed with the preceding input
# block (the first one with $ivec).
440 &set_label("cbc_dec_loop3",16);
441 &movups ($inout0,&QWP(0,$inp));
442 &movups ($inout1,&QWP(0x10,$inp));
443 &movups ($inout2,&QWP(0x20,$inp));
444 &movaps ($in0,$inout0);
445 &movaps ($in1,$inout1);
446 &call ("_aesni_decrypt3");
448 &lea ($inp,&DWP(0x30,$inp));
449 &lea ($out,&DWP(0x30,$out));
450 &pxor ($inout0,$ivec);
451 &pxor ($inout1,$in0);
# The third input block of this group is reloaded as the next $ivec.
452 &movups ($ivec,&QWP(-0x10,$inp));
453 &pxor ($inout2,$in1);
454 &movups (&QWP(-0x30,$out),$inout0);
# (This statement previously lacked its terminating semicolon, which made
# Perl parse the following "&movups (...)" as the right operand of a
# binary &.)
455 &mov ($rounds,$rounds_); # restore $rounds
456 &movups (&QWP(-0x20,$out),$inout1);
457 &mov ($key,$key_); # restore $key
458 &movups (&QWP(-0x10,$out),$inout2);
459 &ja (&label("cbc_dec_loop3"));
# Tail: dispatch on the remaining length, loading one more block before each
# branch.
461 &set_label("cbc_dec_tail");
463 &jz (&label("cbc_ret"));
465 &movups ($inout0,&QWP(0,$inp));
467 &movaps ($in0,$inout0);
468 &jbe (&label("cbc_dec_one"));
469 &movups ($inout1,&QWP(0x10,$inp));
471 &movaps ($in1,$inout1);
472 &jbe (&label("cbc_dec_two"));
473 &movups ($inout2,&QWP(0x20,$inp));
475 &jbe (&label("cbc_dec_three"));
# Four blocks: after the helper returns, $rndkey0/$rndkey1 are reused as
# scratch for input blocks 2 and 3, which are needed for the XOR step.
476 &movups ($inout3,&QWP(0x30,$inp));
477 &call ("_aesni_decrypt4");
478 &movups ($rndkey0,&QWP(0x10,$inp));
479 &movups ($rndkey1,&QWP(0x20,$inp));
480 &pxor ($inout0,$ivec);
481 &pxor ($inout1,$in0);
482 &movups ($ivec,&QWP(0x30,$inp));
483 &movups (&QWP(0,$out),$inout0);
484 &pxor ($inout2,$rndkey0);
485 &pxor ($inout3,$rndkey1);
486 &movups (&QWP(0x10,$out),$inout1);
487 &movups (&QWP(0x20,$out),$inout2);
# The last decrypted block is left in $inout0 for cbc_dec_tail_collected.
488 &movaps ($inout0,$inout3);
489 &lea ($out,&DWP(0x30,$out));
490 &jmp (&label("cbc_dec_tail_collected"));
# One block.
492 &set_label("cbc_dec_one");
494 { &aesni_inline_generate1("dec"); }
496 { &call ("_aesni_decrypt1"); }
497 &pxor ($inout0,$ivec);
498 &movaps ($ivec,$in0);
499 &jmp (&label("cbc_dec_tail_collected"));
# Two blocks: the 3x helper is called; only $inout0/$inout1 are used.
501 &set_label("cbc_dec_two");
502 &call ("_aesni_decrypt3");
503 &pxor ($inout0,$ivec);
504 &pxor ($inout1,$in0);
505 &movups (&QWP(0,$out),$inout0);
506 &movaps ($inout0,$inout1);
507 &movaps ($ivec,$in1);
508 &lea ($out,&DWP(0x10,$out));
509 &jmp (&label("cbc_dec_tail_collected"));
# Three blocks; falls through to cbc_dec_tail_collected.
511 &set_label("cbc_dec_three");
512 &call ("_aesni_decrypt3");
513 &pxor ($inout0,$ivec);
514 &pxor ($inout1,$in0);
515 &pxor ($inout2,$in1);
516 &movups (&QWP(0,$out),$inout0);
517 &movups (&QWP(0x10,$out),$inout1);
518 &movaps ($inout0,$inout2);
519 &movups ($ivec,&QWP(0x20,$inp));
520 &lea ($out,&DWP(0x20,$out));
# $inout0 holds the last decrypted block: store it whole, or take the
# partial path.
522 &set_label("cbc_dec_tail_collected");
524 &jnz (&label("cbc_dec_tail_partial"));
525 &movups (&QWP(0,$out),$inout0);
526 &jmp (&label("cbc_ret"));
# Partial block: spill $inout0 to the stack and byte-copy from there.
# NOTE(review): the stack adjustment and pointer/count setup around these
# two lines are not visible in this excerpt.
528 &set_label("cbc_dec_tail_partial");
532 &movaps (&QWP(0,"esp"),$inout0);
535 &data_word(0xA4F3F689); # rep movsb
# Write the updated IV back through ivp.
538 &set_label("cbc_ret");
539 &mov ($key_,&wparam(4));
540 &movups (&QWP(0,$key_),$ivec); # output IV
541 &function_end("${PREFIX}_cbc_encrypt");
543 # Mechanical port from aesni-x86_64.pl.
545 # _aesni_set_encrypt_key is private interface,
547 # "eax" const unsigned char *userKey
# NOTE(review): the rest of the interface comment, the test/compare lines
# feeding the conditional jumps, the instructions that set $rounds, and the
# return instructions of this routine and its local helpers are not visible
# in this excerpt; comments describe only what the visible lines do.
554 &function_begin_B("_aesni_set_encrypt_key");
556 &jz (&label("bad_pointer"));
558 &jz (&label("bad_pointer"));
560 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
561 &pxor ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
562 &lea ($key,&DWP(16,$key));
# Dispatch to the 14-, 12- or 10-round expansion; anything else goes to
# bad_keybits.
564 &je (&label("14rounds"));
566 &je (&label("12rounds"));
568 &jne (&label("bad_keybits"));
# 10 rounds: store the user key as round 0, then derive rounds 1..10. Each
# aeskeygenassist immediate is the round constant for that step.
570 &set_label("10rounds",16);
572 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
573 &aeskeygenassist("xmm1","xmm0",0x01); # round 1
574 &call (&label("key_128_cold"));
575 &aeskeygenassist("xmm1","xmm0",0x2); # round 2
576 &call (&label("key_128"));
577 &aeskeygenassist("xmm1","xmm0",0x04); # round 3
578 &call (&label("key_128"));
579 &aeskeygenassist("xmm1","xmm0",0x08); # round 4
580 &call (&label("key_128"));
581 &aeskeygenassist("xmm1","xmm0",0x10); # round 5
582 &call (&label("key_128"));
583 &aeskeygenassist("xmm1","xmm0",0x20); # round 6
584 &call (&label("key_128"));
585 &aeskeygenassist("xmm1","xmm0",0x40); # round 7
586 &call (&label("key_128"));
587 &aeskeygenassist("xmm1","xmm0",0x80); # round 8
588 &call (&label("key_128"));
589 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
590 &call (&label("key_128"));
591 &aeskeygenassist("xmm1","xmm0",0x36); # round 10
592 &call (&label("key_128"));
# Store the last round key, then $rounds at offset 80 from the current $key.
593 &$movekey (&QWP(0,$key),"xmm0");
594 &mov (&DWP(80,$key),$rounds);
# key_128: store the previous round key and advance $key, then fall into
# key_128_cold, which folds the aeskeygenassist result into xmm0. The
# "cold" entry skips the store.
598 &set_label("key_128",16);
599 &$movekey (&QWP(0,$key),"xmm0");
600 &lea ($key,&DWP(16,$key));
601 &set_label("key_128_cold");
602 &shufps ("xmm4","xmm0",0b00010000);
603 &pxor ("xmm0","xmm4");
# (A stray trailing comma in this argument list was removed.)
604 &shufps ("xmm4","xmm0",0b10001100);
605 &pxor ("xmm0","xmm4");
606 &pshufd ("xmm1","xmm1",0b11111111); # critical path
607 &pxor ("xmm0","xmm1");
# 12 rounds: the key state is held in xmm0 plus the low half of xmm2.
610 &set_label("12rounds",16);
611 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
# (This statement previously lacked its terminating semicolon, which made
# Perl parse the following "&aeskeygenassist(...)" as the right operand of a
# binary &.)
613 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
614 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
615 &call (&label("key_192a_cold"));
616 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
617 &call (&label("key_192b"));
618 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
619 &call (&label("key_192a"));
620 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
621 &call (&label("key_192b"));
622 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
623 &call (&label("key_192a"));
624 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
625 &call (&label("key_192b"));
626 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
627 &call (&label("key_192a"));
628 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
629 &call (&label("key_192b"));
# Store the last round key, then $rounds at offset 48 from the current $key.
630 &$movekey (&QWP(0,$key),"xmm0");
631 &mov (&DWP(48,$key),$rounds);
# key_192a: store xmm0 and advance $key; key_192a_cold skips the store and
# saves xmm2 in xmm5; key_192b_warm performs the shared update of xmm0/xmm2.
635 &set_label("key_192a",16);
636 &$movekey (&QWP(0,$key),"xmm0");
637 &lea ($key,&DWP(16,$key));
638 &set_label("key_192a_cold",16);
639 &movaps ("xmm5","xmm2");
640 &set_label("key_192b_warm");
641 &shufps ("xmm4","xmm0",0b00010000);
642 &movaps ("xmm3","xmm2");
643 &pxor ("xmm0","xmm4");
644 &shufps ("xmm4","xmm0",0b10001100);
646 &pxor ("xmm0","xmm4");
647 &pshufd ("xmm1","xmm1",0b01010101); # critical path
648 &pxor ("xmm2","xmm3");
649 &pxor ("xmm0","xmm1");
650 &pshufd ("xmm3","xmm0",0b11111111);
651 &pxor ("xmm2","xmm3");
# key_192b: assemble two 16-byte round keys from xmm5/xmm0/xmm2, store
# them, advance $key by 32, and continue at key_192b_warm.
654 &set_label("key_192b",16);
655 &movaps ("xmm3","xmm0");
656 &shufps ("xmm5","xmm0",0b01000100);
657 &$movekey (&QWP(0,$key),"xmm5");
658 &shufps ("xmm3","xmm2",0b01001110);
659 &$movekey (&QWP(16,$key),"xmm3");
660 &lea ($key,&DWP(32,$key));
661 &jmp (&label("key_192b_warm"));
# 14 rounds: the two halves of the user key are rounds 0 and 1; the a/b
# helpers then alternately update xmm0 and xmm2.
663 &set_label("14rounds",16);
664 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
666 &lea ($key,&DWP(16,$key));
667 &$movekey (&QWP(-32,$key),"xmm0"); # round 0
668 &$movekey (&QWP(-16,$key),"xmm2"); # round 1
669 &aeskeygenassist("xmm1","xmm2",0x01); # round 2
670 &call (&label("key_256a_cold"));
671 &aeskeygenassist("xmm1","xmm0",0x01); # round 3
672 &call (&label("key_256b"));
673 &aeskeygenassist("xmm1","xmm2",0x02); # round 4
674 &call (&label("key_256a"));
675 &aeskeygenassist("xmm1","xmm0",0x02); # round 5
676 &call (&label("key_256b"));
677 &aeskeygenassist("xmm1","xmm2",0x04); # round 6
678 &call (&label("key_256a"));
679 &aeskeygenassist("xmm1","xmm0",0x04); # round 7
680 &call (&label("key_256b"));
681 &aeskeygenassist("xmm1","xmm2",0x08); # round 8
682 &call (&label("key_256a"));
683 &aeskeygenassist("xmm1","xmm0",0x08); # round 9
684 &call (&label("key_256b"));
685 &aeskeygenassist("xmm1","xmm2",0x10); # round 10
686 &call (&label("key_256a"));
687 &aeskeygenassist("xmm1","xmm0",0x10); # round 11
688 &call (&label("key_256b"));
689 &aeskeygenassist("xmm1","xmm2",0x20); # round 12
690 &call (&label("key_256a"));
691 &aeskeygenassist("xmm1","xmm0",0x20); # round 13
692 &call (&label("key_256b"));
693 &aeskeygenassist("xmm1","xmm2",0x40); # round 14
694 &call (&label("key_256a"));
# Store the last round key, then $rounds at offset 16 from the current $key.
695 &$movekey (&QWP(0,$key),"xmm0");
696 &mov (&DWP(16,$key),$rounds);
# key_256a: store xmm2 and advance $key, then update xmm0; the "cold" entry
# skips the store.
700 &set_label("key_256a",16);
701 &$movekey (&QWP(0,$key),"xmm2");
702 &lea ($key,&DWP(16,$key));
703 &set_label("key_256a_cold");
704 &shufps ("xmm4","xmm0",0b00010000);
705 &pxor ("xmm0","xmm4");
706 &shufps ("xmm4","xmm0",0b10001100);
707 &pxor ("xmm0","xmm4");
708 &pshufd ("xmm1","xmm1",0b11111111); # critical path
709 &pxor ("xmm0","xmm1");
# key_256b: store xmm0 and advance $key, then update xmm2.
712 &set_label("key_256b",16);
713 &$movekey (&QWP(0,$key),"xmm0");
714 &lea ($key,&DWP(16,$key));
716 &shufps ("xmm4","xmm2",0b00010000);
717 &pxor ("xmm2","xmm4");
718 &shufps ("xmm4","xmm2",0b10001100);
719 &pxor ("xmm2","xmm4");
720 &pshufd ("xmm1","xmm1",0b10101010); # critical path
721 &pxor ("xmm2","xmm1");
# Error exits; the instructions that set the return value are not visible
# in this excerpt.
724 &set_label("bad_pointer",4);
727 &set_label("bad_keybits",4);
730 &function_end_B("_aesni_set_encrypt_key");
732 # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
# Public wrapper: moves the three stack arguments into eax, $rounds and
# $key, then calls the private _aesni_set_encrypt_key routine.
# NOTE(review): the rest of the prototype comment and the return
# instruction are not visible in this excerpt.
734 &function_begin_B("${PREFIX}_set_encrypt_key");
735 &mov ("eax",&wparam(0));
736 &mov ($rounds,&wparam(1));
737 &mov ($key,&wparam(2));
738 &call ("_aesni_set_encrypt_key");
740 &function_end_B("${PREFIX}_set_encrypt_key");
742 # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
# Builds the encryption schedule with _aesni_set_encrypt_key, then converts
# it in place: the round keys are reversed end-for-end and aesimc is applied
# to every key except the two outermost ones.
# NOTE(review): the rest of the prototype comment, the test/compare lines
# feeding jnz/ja, and the return instruction are not visible in this
# excerpt.
744 &function_begin_B("${PREFIX}_set_decrypt_key");
745 &mov ("eax",&wparam(0));
746 &mov ($rounds,&wparam(1));
747 &mov ($key,&wparam(2));
748 &call ("_aesni_set_encrypt_key");
# Reload $key (the callee advanced it) and scale $rounds by 16 to get a
# byte offset.
# (The shl statement previously lacked its terminating semicolon.)
749 &mov ($key,&wparam(2));
750 &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
752 &jnz (&label("dec_key_ret"));
753 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
# Swap the first and last round keys without transforming them, then step
# both pointers towards the middle.
755 &$movekey ("xmm0",&QWP(0,$key)); # just swap
756 &$movekey ("xmm1",&QWP(0,"eax"));
757 &$movekey (&QWP(0,"eax"),"xmm0");
758 &$movekey (&QWP(0,$key),"xmm1");
759 &lea ($key,&DWP(16,$key));
760 &lea ("eax",&DWP(-16,"eax"));
# Inner pairs: apply aesimc to both keys and store each at the other's
# former position (the pointers have already moved, hence the +16/-16
# offsets).
762 &set_label("dec_key_inverse");
763 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
764 &$movekey ("xmm1",&QWP(0,"eax"));
765 &aesimc ("xmm0","xmm0");
766 &aesimc ("xmm1","xmm1");
767 &lea ($key,&DWP(16,$key));
768 &lea ("eax",&DWP(-16,"eax"));
770 &$movekey (&QWP(16,"eax"),"xmm0");
771 &$movekey (&QWP(-16,$key),"xmm1");
772 &ja (&label("dec_key_inverse"));
# The middle key has no partner: transform it in place.
774 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
775 &aesimc ("xmm0","xmm0");
776 &$movekey (&QWP(0,$key),"xmm0");
778 &xor ("eax","eax"); # return success
779 &set_label("dec_key_ret");
781 &function_end_B("${PREFIX}_set_decrypt_key");
# Pass the module's identification string to &asciz (presumably emitted as
# a NUL-terminated string in the generated output -- TODO confirm).
782 &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");