3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # This module implements support for Intel AES-NI extension. In
11 # OpenSSL context it's used with Intel engine, but can also be used as
12 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
15 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
16 # generates drop-in replacement for
17 # crypto/aes/asm/aes-586.pl:-)
# Take the directory part of $0 and add it, plus the perlasm directory two
# levels up from it, to @INC.
19 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
20 push(@INC,"${dir}","${dir}../../perlasm");
# Initialise the assembler back-end with the first command-line argument
# and the script name.
23 &asm_init($ARGV[0],$0);
# $movekey is the emitter used for every round-key load/store below.
# It was previously selected by comparing $RREFIX with "aseni"; both are
# misspellings ($PREFIX / "aesni"), so the test was always false and movups
# was always chosen. That selection is now spelled out: output is unchanged.
# NOTE(review): switching to movaps would require every key schedule to be
# 16-byte aligned -- confirm with all callers before attempting that.
25 $movekey = eval("*movups");
# Generator for the private single-block routine _aesni_${p}rypt1, written
# as a loop over the round keys (one aes${p} per iteration).
# $p selects the instruction family: the code emits aes${p} and
# aes${p}last -- presumably $p is "enc" or "dec"; its assignment is not
# visible in this excerpt.
# Registers used: $key (round-key pointer, advanced as keys are consumed),
# $inout0 (the block, transformed in place), $rndkey0/$rndkey1 (scratch).
44 sub _aesni_generate1 # folded loop
47 &function_begin_B("_aesni_${p}rypt1");
48 &$movekey ($rndkey0,&QWP(0,$key));
49 &$movekey ($rndkey1,&QWP(16,$key));
50 &lea ($key,&DWP(16,$key));
# Whitening: xor the block with the key at offset 0.
51 &pxor ($inout0,$rndkey0);
53 &set_label("${p}1_loop",16);
54 eval"&aes${p} ($inout0,$rndkey1)";
# Step to the next round key and preload it for the next iteration.
56 &lea ($key,&DWP(16,$key));
57 &$movekey ($rndkey1,&QWP(0,$key));
# NOTE(review): the instruction that sets the flags tested by this jnz is
# not visible in this excerpt (neither lea nor the key move sets flags).
58 &jnz (&label("${p}1_loop"));
# Final round uses the most recently loaded key.
59 eval"&aes${p}last ($inout0,$rndkey1)";
61 &function_end_B("_aesni_${p}rypt1");
# Generator for the private single-block routine _aesni_${p}rypt1 with the
# round sequence fully unrolled. It is instantiated below with "enc" and
# "dec". $p selects the aes${p}/aes${p}last instruction family.
# Three entry points share one tail: fall-through (longest schedule), the
# "${p}192" label and the "${p}128" label. $key is advanced by 0x30, 0x50 or
# 0x70 bytes depending on the path, and the key loads use offsets relative
# to that adjusted pointer so that the shared tail works for all three.
# $rndkey0 and $rndkey1 alternate so each key is loaded one step ahead of
# the aes${p} that consumes it.
64 sub aesni_generate1 # fully unrolled loop
67 &function_begin_B("_aesni_${p}rypt1");
68 &$movekey ($rndkey0,&QWP(0,$key));
69 &$movekey ($rndkey1,&QWP(0x10,$key));
# Whitening with the key at offset 0.
71 &pxor ($inout0,$rndkey0);
72 &$movekey ($rndkey0,&QWP(0x20,$key));
73 &lea ($key,&DWP(0x30,$key));
# NOTE(review): the comparison whose flags drive the jb/je below is not
# visible in this excerpt; presumably it compares $rounds with a constant.
74 &jb (&label("${p}128"));
75 &lea ($key,&DWP(0x20,$key));
76 &je (&label("${p}192"));
77 &lea ($key,&DWP(0x20,$key));
# Two extra rounds executed only on the fall-through (longest) path.
78 eval"&aes${p} ($inout0,$rndkey1)";
79 &$movekey ($rndkey1,&QWP(-0x40,$key));
80 eval"&aes${p} ($inout0,$rndkey0)";
81 &$movekey ($rndkey0,&QWP(-0x30,$key));
# Two extra rounds shared by the fall-through and "${p}192" paths.
82 &set_label("${p}192");
83 eval"&aes${p} ($inout0,$rndkey1)";
84 &$movekey ($rndkey1,&QWP(-0x20,$key));
85 eval"&aes${p} ($inout0,$rndkey0)";
86 &$movekey ($rndkey0,&QWP(-0x10,$key));
# Common tail: nine aes${p} rounds followed by aes${p}last.
87 &set_label("${p}128");
88 eval"&aes${p} ($inout0,$rndkey1)";
89 &$movekey ($rndkey1,&QWP(0,$key));
90 eval"&aes${p} ($inout0,$rndkey0)";
91 &$movekey ($rndkey0,&QWP(0x10,$key));
92 eval"&aes${p} ($inout0,$rndkey1)";
93 &$movekey ($rndkey1,&QWP(0x20,$key));
94 eval"&aes${p} ($inout0,$rndkey0)";
95 &$movekey ($rndkey0,&QWP(0x30,$key));
96 eval"&aes${p} ($inout0,$rndkey1)";
97 &$movekey ($rndkey1,&QWP(0x40,$key));
98 eval"&aes${p} ($inout0,$rndkey0)";
99 &$movekey ($rndkey0,&QWP(0x50,$key));
100 eval"&aes${p} ($inout0,$rndkey1)";
101 &$movekey ($rndkey1,&QWP(0x60,$key));
102 eval"&aes${p} ($inout0,$rndkey0)";
103 &$movekey ($rndkey0,&QWP(0x70,$key));
104 eval"&aes${p} ($inout0,$rndkey1)";
105 eval"&aes${p}last ($inout0,$rndkey0)";
107 &function_end_B("_aesni_${p}rypt1");
# Instantiate the single-block encrypt helper (_aesni_encrypt1), then emit
# the public single-block entry point that wraps it.
110 &aesni_generate1("enc");
111 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
112 &function_begin_B("${PREFIX}_encrypt");
113 &mov ("eax",&wparam(0)); # eax = inp
114 &mov ($key,&wparam(2)); # $key = key
115 &movups ($inout0,&QWP(0,"eax")); # load the 16-byte input block
116 &mov ($rounds,&DWP(240,$key)); # $rounds = dword at offset 240 of *key
117 &mov ("eax",&wparam(1)); # eax = out (fetched before the call)
118 &call ("_aesni_encrypt1");
119 &movups (&QWP(0,"eax"),$inout0); # store the result block
121 &function_end_B("${PREFIX}_encrypt");
# Instantiate the single-block decrypt helper (_aesni_decrypt1), then emit
# the public single-block entry point that wraps it.
123 &aesni_generate1("dec");
124 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
125 &function_begin_B("${PREFIX}_decrypt");
126 &mov ("eax",&wparam(0)); # eax = inp
127 &mov ($key,&wparam(2)); # $key = key
128 &movups ($inout0,&QWP(0,"eax")); # load the 16-byte input block
129 &mov ($rounds,&DWP(240,$key)); # $rounds = dword at offset 240 of *key
130 &mov ("eax",&wparam(1)); # eax = out (fetched before the call)
131 &call ("_aesni_decrypt1");
132 &movups (&QWP(0,"eax"),$inout0); # store the result block
134 &function_end_B("${PREFIX}_decrypt");
136 # _aesni_[en|de]crypt3 are private interfaces, 3 denotes interleave
137 # factor. Why 3x? Even though aes[enc|dec] latency is 6, it turned
138 # out that it can be scheduled only every *second* cycle. Thus 3x
139 # interleave is the one providing optimal utilization, i.e. when
140 # subroutine's throughput is virtually same as of non-interleaved
141 # subroutine for number of input blocks up to 3. This is why it
142 # handles even double-block inputs. Larger interleave factor would
143 # perform suboptimally on shorter inputs...
# Body of the generator for _aesni_${p}rypt3, which processes $inout0,
# $inout1 and $inout2 in parallel with the same round keys. The enclosing
# sub header is not visible in this excerpt; it is invoked below as
# aesni_generate3("enc"/"dec"). Each loop iteration applies two rounds
# (one with $rndkey1, one with $rndkey0) and advances $key by 32 bytes.
148 &function_begin_B("_aesni_${p}rypt3");
149 &$movekey ($rndkey0,&QWP(0,$key));
150 &$movekey ($rndkey1,&QWP(16,$key));
152 &lea ($key,&DWP(32,$key));
# Whitening of all three blocks with the key at offset 0.
153 &pxor ($inout0,$rndkey0);
154 &pxor ($inout1,$rndkey0);
156 &pxor ($inout2,$rndkey0);
157 &jmp (&label("${p}3_loop"));
158 &set_label("${p}3_loop",16);
159 eval"&aes${p} ($inout0,$rndkey1)";
160 &$movekey ($rndkey0,&QWP(0,$key));
161 eval"&aes${p} ($inout1,$rndkey1)";
163 eval"&aes${p} ($inout2,$rndkey1)";
164 &$movekey ($rndkey1,&QWP(16,$key));
165 eval"&aes${p} ($inout0,$rndkey0)";
166 &lea ($key,&DWP(32,$key));
167 eval"&aes${p} ($inout1,$rndkey0)";
168 eval"&aes${p} ($inout2,$rndkey0)";
# NOTE(review): the instruction that sets the flags for this jnz is not
# visible in this excerpt.
169 &jnz (&label("${p}3_loop"));
# Last regular round with $rndkey1, then the final round with the key
# loaded from the current $key position.
170 eval"&aes${p} ($inout0,$rndkey1)";
171 &$movekey ($rndkey0,&QWP(0,$key));
172 eval"&aes${p} ($inout1,$rndkey1)";
173 eval"&aes${p} ($inout2,$rndkey1)";
174 eval"&aes${p}last ($inout0,$rndkey0)";
175 eval"&aes${p}last ($inout1,$rndkey0)";
176 eval"&aes${p}last ($inout2,$rndkey0)";
178 &function_end_B("_aesni_${p}rypt3");
# The 3-way encrypt helper is only generated when $PREFIX is "aesni"; the
# 3-way decrypt helper is always generated.
180 &aesni_generate3("enc") if ($PREFIX eq "aesni");
181 &aesni_generate3("dec");
183 if ($PREFIX eq "aesni") {
184 # void aesni_ecb_encrypt (const void *in, void *out,
185 # size_t length, const AES_KEY *key,
# Emits aesni_ecb_encrypt. Arguments are read positionally:
#   wparam(0) -> $inp, wparam(1) -> $out, wparam(2) -> $len,
#   wparam(3) -> $key, wparam(4) -> direction flag (zero selects decrypt).
# Input is processed three blocks at a time through _aesni_{en,de}crypt3,
# with a tail that handles one block (_aesni_{en,de}crypt1) or two blocks
# (the 3-way helper, of which only two results are stored).
# $key and $rounds are saved in $key_/$rounds_ and restored after each 3-way
# call in the loops.
# NOTE(review): several flag-setting instructions (for the jb/jc/jnc/jz/je
# branches on $len) are not visible in this excerpt.
188 &function_begin("aesni_ecb_encrypt");
189 &mov ($inp,&wparam(0));
190 &mov ($out,&wparam(1));
191 &mov ($len,&wparam(2));
192 &mov ($key,&wparam(3));
193 &mov ($rounds,&wparam(4));
195 &jb (&label("ecb_ret"));
# Test the direction flag now; the jz further down consumes these flags
# (the intervening mov instructions do not modify them).
197 &test ($rounds,$rounds);
198 &mov ($rounds,&DWP(240,$key));
199 &mov ($key_,$key); # backup $key
200 &mov ($rounds_,$rounds); # backup $rounds
201 &jz (&label("ecb_decrypt"));
204 &jc (&label("ecb_enc_tail"));
205 &jmp (&label("ecb_enc_loop3"));
207 &set_label("ecb_enc_loop3",16);
208 &movups ($inout0,&QWP(0,$inp));
209 &movups ($inout1,&QWP(0x10,$inp));
210 &movups ($inout2,&QWP(0x20,$inp));
211 &lea ($inp,&DWP(0x30,$inp));
212 &call ("_aesni_encrypt3");
# Store each of the three results to its own output slot.
213 &movups (&QWP(0,$out),$inout0);
215 &movups (&QWP(0x10,$out),$inout1);
216 &mov ($key,$key_); # restore $key
217 &movups (&QWP(0x20,$out),$inout2);
218 &mov ($rounds,$rounds_); # restore $rounds
219 &lea ($out,&DWP(0x30,$out));
220 &jnc (&label("ecb_enc_loop3"));
222 &set_label("ecb_enc_tail");
224 &jz (&label("ecb_ret"));
227 &movups ($inout0,&QWP(0,$inp));
228 &je (&label("ecb_enc_one"));
# Two blocks left: run the 3-way helper and keep the first two results.
229 &movups ($inout1,&QWP(0x10,$inp));
230 &call ("_aesni_encrypt3");
231 &movups (&QWP(0,$out),$inout0);
232 &movups (&QWP(0x10,$out),$inout1);
233 &jmp (&label("ecb_ret"));
235 &set_label("ecb_enc_one",16);
236 &call ("_aesni_encrypt1");
237 &movups (&QWP(0,$out),$inout0);
238 &jmp (&label("ecb_ret"));
240 &set_label("ecb_decrypt",16);
242 &jc (&label("ecb_dec_tail"));
243 &jmp (&label("ecb_dec_loop3"));
245 &set_label("ecb_dec_loop3",16);
246 &movups ($inout0,&QWP(0,$inp));
247 &movups ($inout1,&QWP(0x10,$inp));
248 &movups ($inout2,&QWP(0x20,$inp));
249 &call ("_aesni_decrypt3");
# Store each of the three results to its own output slot.
250 &movups (&QWP(0,$out),$inout0);
252 &lea ($inp,&DWP(0x30,$inp));
253 &movups (&QWP(0x10,$out),$inout1);
254 &mov ($key,$key_); # restore $key
255 &movups (&QWP(0x20,$out),$inout2);
256 &mov ($rounds,$rounds_); # restore $rounds
257 &lea ($out,&DWP(0x30,$out));
258 &jnc (&label("ecb_dec_loop3"));
260 &set_label("ecb_dec_tail");
262 &jz (&label("ecb_ret"));
265 &movups ($inout0,&QWP(0,$inp));
266 &je (&label("ecb_dec_one"));
# Two blocks left: run the 3-way helper and keep the first two results.
267 &movups ($inout1,&QWP(0x10,$inp));
268 &call ("_aesni_decrypt3");
269 &movups (&QWP(0,$out),$inout0);
270 &movups (&QWP(0x10,$out),$inout1);
271 &jmp (&label("ecb_ret"));
273 &set_label("ecb_dec_one",16);
274 &call ("_aesni_decrypt1");
275 &movups (&QWP(0,$out),$inout0);
277 &set_label("ecb_ret");
278 &function_end("aesni_ecb_encrypt");
281 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
282 # size_t length, const AES_KEY *key,
283 # unsigned char *ivp,const int enc);
# Emits ${PREFIX}_cbc_encrypt. Arguments are read positionally:
#   wparam(0) -> $inp, wparam(1) -> $out, wparam(2) -> $len,
#   wparam(3) -> $key, wparam(4) -> IV pointer (held temporarily in $key_).
# Encryption runs one block at a time through _aesni_encrypt1, chaining via
# $inout0. Decryption runs three blocks at a time through _aesni_decrypt3,
# keeping copies of the ciphertext blocks ($in0, $in1, $ivec) for the
# chaining xor, with tails for one, two or three remaining blocks and a
# partial-block copy at the very end.
# On the normal exit path ("cbc_ret") the updated IV in $ivec is written
# back through wparam(4). The early exit taken before $ivec has been loaded
# goes to "cbc_abort" and leaves the caller's IV buffer untouched.
# NOTE(review): several flag-setting instructions for the branches on $len
# and on the direction argument are not visible in this excerpt.
284 &function_begin("${PREFIX}_cbc_encrypt");
285 &mov ($inp,&wparam(0));
286 &mov ($out,&wparam(1));
287 &mov ($len,&wparam(2));
288 &mov ($key,&wparam(3));
290 &mov ($key_,&wparam(4));
# $ivec is not loaded yet, so this exit must bypass the IV write-back.
291 &je (&label("cbc_abort"));
294 &movups ($ivec,&QWP(0,$key_)); # load IV
295 &mov ($rounds,&DWP(240,$key));
296 &mov ($key_,$key); # backup $key
297 &mov ($rounds_,$rounds); # backup $rounds
298 &je (&label("cbc_decrypt"));
# Encrypt path: $inout0 carries the chaining value between iterations.
300 &movaps ($inout0,$ivec);
302 &jb (&label("cbc_enc_tail"));
304 &jmp (&label("cbc_enc_loop"));
306 &set_label("cbc_enc_loop",16);
307 &movups ($ivec,&QWP(0,$inp));
308 &lea ($inp,&DWP(16,$inp));
309 &pxor ($inout0,$ivec);
310 &call ("_aesni_encrypt1");
312 &mov ($rounds,$rounds_); # restore $rounds
313 &mov ($key,$key_); # restore $key
314 &movups (&QWP(0,$out),$inout0);
315 &lea ($out,&DWP(16,$out));
316 &jnc (&label("cbc_enc_loop"));
318 &jnz (&label("cbc_enc_tail"));
# No partial block left: the last ciphertext block becomes the output IV.
319 &movaps ($ivec,$inout0);
320 &jmp (&label("cbc_ret"));
# Partial final block: copy the remaining input bytes to $out, zero-fill,
# then rewind and run the loop once more on that block in place.
322 &set_label("cbc_enc_tail");
323 &mov ("ecx",$len); # zaps $rounds
324 &data_word(0xA4F3F689); # rep movsb
325 &mov ("ecx",16); # zero tail
327 &xor ("eax","eax"); # zaps $len
328 &data_word(0xAAF3F689); # rep stosb
329 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
330 &mov ($rounds,$rounds_); # restore $rounds
331 &mov ($inp,$out); # $inp and $out are the same
332 &mov ($key,$key_); # restore $key
333 &jmp (&label("cbc_enc_loop"));
335 &set_label("cbc_decrypt",16);
337 &jc (&label("cbc_dec_tail"));
338 &jmp (&label("cbc_dec_loop3"));
340 &set_label("cbc_dec_loop3",16);
341 &movups ($inout0,&QWP(0,$inp));
342 &movups ($inout1,&QWP(0x10,$inp));
343 &movups ($inout2,&QWP(0x20,$inp));
# Keep the first two ciphertext blocks for chaining into blocks 1 and 2.
344 &movaps ($in0,$inout0);
345 &movaps ($in1,$inout1);
346 &call ("_aesni_decrypt3");
348 &lea ($inp,&DWP(0x30,$inp));
349 &pxor ($inout0,$ivec);
350 &pxor ($inout1,$in0);
# $inp has already been advanced by 0x30, so the third ciphertext block of
# this iteration (the next IV) now sits at -0x10. It is read before any
# output is stored, so in-place operation still sees the original input.
351 &movups ($ivec,&QWP(-0x10,$inp));
352 &pxor ($inout2,$in1);
353 &movups (&QWP(0,$out),$inout0);
354 &mov ($rounds,$rounds_); # restore $rounds
355 &movups (&QWP(0x10,$out),$inout1);
356 &mov ($key,$key_); # restore $key
357 &movups (&QWP(0x20,$out),$inout2);
358 &lea ($out,&DWP(0x30,$out));
359 &jnc (&label("cbc_dec_loop3"));
361 &set_label("cbc_dec_tail");
363 &jz (&label("cbc_ret"));
365 &movups ($inout0,&QWP(0,$inp));
367 &movaps ($in0,$inout0);
368 &jbe (&label("cbc_dec_one"));
369 &movups ($inout1,&QWP(0x10,$inp));
371 &movaps ($in1,$inout1);
372 &jbe (&label("cbc_dec_two"));
# Three-block tail: $inp has not been advanced here, so the third
# ciphertext block is still at +0x20.
373 &movups ($inout2,&QWP(0x20,$inp));
374 &call ("_aesni_decrypt3");
375 &pxor ($inout0,$ivec);
376 &movups ($ivec,&QWP(0x20,$inp));
377 &pxor ($inout1,$in0);
378 &pxor ($inout2,$in1);
379 &movups (&QWP(0,$out),$inout0);
380 &movups (&QWP(0x10,$out),$inout1);
# The last plaintext block is left in $inout0 for the common tail below.
381 &movaps ($inout0,$inout2);
382 &lea ($out,&DWP(0x20,$out));
383 &jmp (&label("cbc_dec_tail_collected"));
385 &set_label("cbc_dec_one");
386 &call ("_aesni_decrypt1");
387 &pxor ($inout0,$ivec);
388 &movaps ($ivec,$in0);
389 &jmp (&label("cbc_dec_tail_collected"));
391 &set_label("cbc_dec_two");
392 &call ("_aesni_decrypt3");
393 &pxor ($inout0,$ivec);
394 &pxor ($inout1,$in0);
395 &movups (&QWP(0,$out),$inout0);
396 &movaps ($inout0,$inout1);
397 &movaps ($ivec,$in1);
398 &lea ($out,&DWP(0x10,$out));
# Common tail: $inout0 holds the final plaintext block, $out points at its
# destination and $ivec holds the next IV.
400 &set_label("cbc_dec_tail_collected");
402 &jnz (&label("cbc_dec_tail_partial"));
403 &movups (&QWP(0,$out),$inout0);
404 &jmp (&label("cbc_ret"));
# Partial final block: spill the block to the stack and byte-copy from it.
# NOTE(review): movaps needs a 16-byte aligned address; the stack
# adjustment that would guarantee this is not visible in this excerpt.
406 &set_label("cbc_dec_tail_partial");
410 &movaps (&QWP(0,"esp"),$inout0);
413 &data_word(0xA4F3F689); # rep movsb
416 &set_label("cbc_ret");
417 &mov ($key_,&wparam(4));
418 &movups (&QWP(0,$key_),$ivec); # output IV
&set_label("cbc_abort"); # early exit: nothing processed, IV not written
419 &function_end("${PREFIX}_cbc_encrypt");
421 # Mechanical port from aesni-x86_64.pl.
423 # _aesni_set_encrypt_key is private interface,
425 # "eax" const unsigned char *userKey
# Emits the private key-expansion routine _aesni_set_encrypt_key.
# Register interface as used by the wrappers below: "eax" = userKey,
# $rounds = key length in bits on entry, $key = destination schedule.
# Three expansion sequences are emitted ("10rounds", "12rounds",
# "14rounds"), each a series of aeskeygenassist instructions paired with
# calls to small local helpers (key_128*, key_192*, key_256*) that fold the
# assist result into the running key and store round keys through $key.
# Each sequence ends by storing $rounds at a fixed offset past the last
# round key. "bad_pointer" and "bad_keybits" are the error exits.
# NOTE(review): the flag-setting instructions for the branches, the return
# instructions of the helpers and the values loaded into $rounds/"eax" on
# exit are not visible in this excerpt.
432 &function_begin_B("_aesni_set_encrypt_key");
434 &jz (&label("bad_pointer"));
436 &jz (&label("bad_pointer"));
438 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
439 &pxor ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
440 &lea ($key,&DWP(16,$key));
442 &je (&label("14rounds"));
444 &je (&label("12rounds"));
446 &jne (&label("bad_keybits"));
448 &set_label("10rounds",16);
450 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
451 &aeskeygenassist("xmm1","xmm0",0x01); # round 1
# First step enters at key_128_cold, which skips the store-and-advance.
452 &call (&label("key_128_cold"));
453 &aeskeygenassist("xmm1","xmm0",0x2); # round 2
454 &call (&label("key_128"));
455 &aeskeygenassist("xmm1","xmm0",0x04); # round 3
456 &call (&label("key_128"));
457 &aeskeygenassist("xmm1","xmm0",0x08); # round 4
458 &call (&label("key_128"));
459 &aeskeygenassist("xmm1","xmm0",0x10); # round 5
460 &call (&label("key_128"));
461 &aeskeygenassist("xmm1","xmm0",0x20); # round 6
462 &call (&label("key_128"));
463 &aeskeygenassist("xmm1","xmm0",0x40); # round 7
464 &call (&label("key_128"));
465 &aeskeygenassist("xmm1","xmm0",0x80); # round 8
466 &call (&label("key_128"));
467 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
468 &call (&label("key_128"));
469 &aeskeygenassist("xmm1","xmm0",0x36); # round 10
470 &call (&label("key_128"));
471 &$movekey (&QWP(0,$key),"xmm0");
472 &mov (&DWP(80,$key),$rounds);
# key_128: store the current round key, advance $key, then fall through to
# key_128_cold, which derives the next round key in xmm0.
476 &set_label("key_128",16);
477 &$movekey (&QWP(0,$key),"xmm0");
478 &lea ($key,&DWP(16,$key));
479 &set_label("key_128_cold");
480 &shufps ("xmm4","xmm0",0b00010000);
481 &pxor ("xmm0","xmm4");
482 &shufps ("xmm4","xmm0",0b10001100);
483 &pxor ("xmm0","xmm4");
484 &pshufd ("xmm1","xmm1",0b11111111); # critical path
485 &pxor ("xmm0","xmm1");
488 &set_label("12rounds",16);
489 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
491 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
492 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
493 &call (&label("key_192a_cold"));
494 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
495 &call (&label("key_192b"));
496 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
497 &call (&label("key_192a"));
498 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
499 &call (&label("key_192b"));
500 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
501 &call (&label("key_192a"));
502 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
503 &call (&label("key_192b"));
504 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
505 &call (&label("key_192a"));
506 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
507 &call (&label("key_192b"));
508 &$movekey (&QWP(0,$key),"xmm0");
509 &mov (&DWP(48,$key),$rounds);
# key_192a stores one round key from xmm0; key_192b stores two, assembled
# from xmm5/xmm0 and xmm0/xmm2. Both continue into the shared update code
# at key_192b_warm.
513 &set_label("key_192a",16);
514 &$movekey (&QWP(0,$key),"xmm0");
515 &lea ($key,&DWP(16,$key));
516 &set_label("key_192a_cold",16);
517 &movaps ("xmm5","xmm2");
518 &set_label("key_192b_warm");
519 &shufps ("xmm4","xmm0",0b00010000);
520 &movaps ("xmm3","xmm2");
521 &pxor ("xmm0","xmm4");
522 &shufps ("xmm4","xmm0",0b10001100);
524 &pxor ("xmm0","xmm4");
525 &pshufd ("xmm1","xmm1",0b01010101); # critical path
526 &pxor ("xmm2","xmm3");
527 &pxor ("xmm0","xmm1");
528 &pshufd ("xmm3","xmm0",0b11111111);
529 &pxor ("xmm2","xmm3");
532 &set_label("key_192b",16);
533 &movaps ("xmm3","xmm0");
534 &shufps ("xmm5","xmm0",0b01000100);
535 &$movekey (&QWP(0,$key),"xmm5");
536 &shufps ("xmm3","xmm2",0b01001110);
537 &$movekey (&QWP(16,$key),"xmm3");
538 &lea ($key,&DWP(32,$key));
539 &jmp (&label("key_192b_warm"));
541 &set_label("14rounds",16);
542 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
544 &lea ($key,&DWP(16,$key));
545 &$movekey (&QWP(-32,$key),"xmm0"); # round 0
546 &$movekey (&QWP(-16,$key),"xmm2"); # round 1
547 &aeskeygenassist("xmm1","xmm2",0x01); # round 2
548 &call (&label("key_256a_cold"));
549 &aeskeygenassist("xmm1","xmm0",0x01); # round 3
550 &call (&label("key_256b"));
551 &aeskeygenassist("xmm1","xmm2",0x02); # round 4
552 &call (&label("key_256a"));
553 &aeskeygenassist("xmm1","xmm0",0x02); # round 5
554 &call (&label("key_256b"));
555 &aeskeygenassist("xmm1","xmm2",0x04); # round 6
556 &call (&label("key_256a"));
557 &aeskeygenassist("xmm1","xmm0",0x04); # round 7
558 &call (&label("key_256b"));
559 &aeskeygenassist("xmm1","xmm2",0x08); # round 8
560 &call (&label("key_256a"));
561 &aeskeygenassist("xmm1","xmm0",0x08); # round 9
562 &call (&label("key_256b"));
563 &aeskeygenassist("xmm1","xmm2",0x10); # round 10
564 &call (&label("key_256a"));
565 &aeskeygenassist("xmm1","xmm0",0x10); # round 11
566 &call (&label("key_256b"));
567 &aeskeygenassist("xmm1","xmm2",0x20); # round 12
568 &call (&label("key_256a"));
569 &aeskeygenassist("xmm1","xmm0",0x20); # round 13
570 &call (&label("key_256b"));
571 &aeskeygenassist("xmm1","xmm2",0x40); # round 14
572 &call (&label("key_256a"));
573 &$movekey (&QWP(0,$key),"xmm0");
574 &mov (&DWP(16,$key),$rounds);
# key_256a stores xmm2 and updates xmm0; key_256b stores xmm0 and updates
# xmm2. The two halves of the running key alternate between them.
578 &set_label("key_256a",16);
579 &$movekey (&QWP(0,$key),"xmm2");
580 &lea ($key,&DWP(16,$key));
581 &set_label("key_256a_cold");
582 &shufps ("xmm4","xmm0",0b00010000);
583 &pxor ("xmm0","xmm4");
584 &shufps ("xmm4","xmm0",0b10001100);
585 &pxor ("xmm0","xmm4");
586 &pshufd ("xmm1","xmm1",0b11111111); # critical path
587 &pxor ("xmm0","xmm1");
590 &set_label("key_256b",16);
591 &$movekey (&QWP(0,$key),"xmm0");
592 &lea ($key,&DWP(16,$key));
594 &shufps ("xmm4","xmm2",0b00010000);
595 &pxor ("xmm2","xmm4");
596 &shufps ("xmm4","xmm2",0b10001100);
597 &pxor ("xmm2","xmm4");
598 &pshufd ("xmm1","xmm1",0b10101010); # critical path
599 &pxor ("xmm2","xmm1");
# Error exits; the code that sets the return value is not visible here.
602 &set_label("bad_pointer",4);
605 &set_label("bad_keybits",4);
608 &function_end_B("_aesni_set_encrypt_key");
610 # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
# Public wrapper: loads the three stack arguments into the registers that
# _aesni_set_encrypt_key reads ("eax" = wparam(0), $rounds = wparam(1),
# $key = wparam(2)) and calls it.
612 &function_begin_B("${PREFIX}_set_encrypt_key");
613 &mov ("eax",&wparam(0));
614 &mov ($rounds,&wparam(1));
615 &mov ($key,&wparam(2));
616 &call ("_aesni_set_encrypt_key");
618 &function_end_B("${PREFIX}_set_encrypt_key");
620 # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
# Emits ${PREFIX}_set_decrypt_key. It first builds the encryption schedule
# via _aesni_set_encrypt_key, then converts it in place:
#   - the first and last round keys are swapped as they are;
#   - the remaining keys are swapped pairwise from both ends towards the
#     middle, each passed through aesimc on the way;
#   - the middle key is passed through aesimc in place.
# $key walks up from the start of the schedule while "eax" walks down from
# its end ($key + $rounds*16).
# NOTE(review): the instructions that set the flags for the jnz and ja
# branches are not visible in this excerpt.
622 &function_begin_B("${PREFIX}_set_decrypt_key");
623 &mov ("eax",&wparam(0));
624 &mov ($rounds,&wparam(1));
625 &mov ($key,&wparam(2));
626 &call ("_aesni_set_encrypt_key");
# Reload $key: the expansion routine advanced it.
627 &mov ($key,&wparam(2));
628 &shl ($rounds,4); # actually rounds after _aesni_set_encrypt_key
630 &jnz (&label("dec_key_ret"));
631 &lea ("eax",&DWP(0,$key,$rounds)); # end of key schedule
633 &$movekey ("xmm0",&QWP(0,$key)); # just swap
634 &$movekey ("xmm1",&QWP(0,"eax"));
635 &$movekey (&QWP(0,"eax"),"xmm0");
636 &$movekey (&QWP(0,$key),"xmm1");
637 &lea ($key,&DWP(16,$key));
638 &lea ("eax",&DWP(-16,"eax"));
639 &jmp (&label("dec_key_inverse"));
641 &set_label("dec_key_inverse",16);
642 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
643 &$movekey ("xmm1",&QWP(0,"eax"));
644 &aesimc ("xmm0","xmm0");
645 &aesimc ("xmm1","xmm1");
646 &lea ($key,&DWP(16,$key));
647 &lea ("eax",&DWP(-16,"eax"));
# Both pointers have already moved, hence the +16 / -16 store offsets.
649 &$movekey (&QWP(16,"eax"),"xmm0");
650 &$movekey (&QWP(-16,$key),"xmm1");
651 &ja (&label("dec_key_inverse"));
653 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
654 &aesimc ("xmm0","xmm0");
655 &$movekey (&QWP(0,$key),"xmm0");
657 &xor ("eax","eax"); # return success
658 &set_label("dec_key_ret");
660 &function_end_B("${PREFIX}_set_decrypt_key");
# Embed an identification string in the generated output.
661 &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");