aesni-x86.pl: eliminate development comments.
[oweals/openssl.git] / crypto / aes / asm / aesni-x86.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # This module implements support for Intel AES-NI extension. In
11 # OpenSSL context it's used with Intel engine, but can also be used as
12 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
13 # details].
14
$PREFIX="aesni";        # if $PREFIX is set to "AES", the script
                        # generates drop-in replacement for
                        # crypto/aes/asm/aes-586.pl:-)
$inline=1;              # inline _aesni_[en|de]crypt

# Make x86asm.pl loadable from the directory this script lives in as well
# as from the perlasm directory two levels above it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# Emitter used for every key schedule load and store.
#
# The selector that used to live here tested the never-assigned variable
# $RREFIX against the misspelt string "aseni", so it could only ever yield
# movups. That outcome is kept, which leaves the generated code unchanged,
# but it is now spelled out without a string eval.
# NOTE(review): movaps looks like it was intended when $PREFIX is "aesni";
# enable it only after confirming that every caller hands in a 16-byte
# aligned AES_KEY.
$movekey = \&movups;
27
# General purpose register assignments.
($len,$rounds,$key,$inp,$out)=("eax","ecx","edx","esi","edi");
($rounds_,$key_)=("ebx","ebp");         # backup copies for $rounds and $key

# XMM register assignments, xmm0 through xmm7 in order.
($inout0,$inout1,$inout2,$rndkey0,$rndkey1,$ivec,$in0,$in1)=
        map { "xmm$_" } (0..7);
$inout3=$in1;                           # $inout3 shares xmm7 with $in1
44
# Inline version of internal aesni_[en|de]crypt1
#
# Emits the code that transforms the single block held in $inout0.
# The only argument is "enc" or "dec"; it selects the aes<p>/aes<p>last
# instruction pair and prefixes the loop label. The emitted code advances
# $key through the schedule and counts $rounds down to zero, so callers
# that need those registers afterwards have to restore them.
#
# The AES emitters are called through code references rather than string
# eval, so a failure inside an emitter is reported instead of being
# silently discarded.
sub aesni_inline_generate1
{ my $p=shift;
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};

    &$movekey           ($rndkey0,&QWP(0,$key));
    &$movekey           ($rndkey1,&QWP(16,$key));
    &lea                ($key,&DWP(32,$key));
    &pxor               ($inout0,$rndkey0);
    &set_label("${p}1_loop");
        &$aes           ($inout0,$rndkey1);
        &dec            ($rounds);      # sets the flags tested by jnz
        &$movekey       ($rndkey1,&QWP(0,$key));
        &lea            ($key,&DWP(16,$key));
    &jnz                (&label("${p}1_loop"));
    &$aeslast           ($inout0,$rndkey1);
}
61
# Emits the out-of-line routine _aesni_<p>rypt1, a fully unrolled
# single-block transform of $inout0. The argument is "enc" or "dec".
# $rounds is compared against 11 to pick the entry point: below 11 jumps
# to the "128" label, equal to 11 jumps to "192", anything else runs the
# longest path.
#
# The AES emitters are called through code references rather than string
# eval, so a failure inside an emitter is reported instead of being
# silently discarded.
sub aesni_generate1     # fully unrolled loop
{ my $p=shift;
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};
  # One round using the key held in $rk, followed by a refill of $rk from
  # displacement $disp off $key.
  my $round=sub
  { my ($rk,$disp)=@_;
    &$aes               ($inout0,$rk);
    &$movekey           ($rk,&QWP($disp,$key));
  };

    &function_begin_B("_aesni_${p}rypt1");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &$movekey       ($rndkey1,&QWP(0x10,$key));
        &cmp            ($rounds,11);
        &pxor           ($inout0,$rndkey0);
        &$movekey       ($rndkey0,&QWP(0x20,$key));
        &lea            ($key,&DWP(0x30,$key));
        &jb             (&label("${p}128"));
        &lea            ($key,&DWP(0x20,$key));
        &je             (&label("${p}192"));
        &lea            ($key,&DWP(0x20,$key));
        &$round         ($rndkey1,-0x40);
        &$round         ($rndkey0,-0x30);
    &set_label("${p}192");
        &$round         ($rndkey1,-0x20);
        &$round         ($rndkey0,-0x10);
    &set_label("${p}128");
        # eight rounds alternating $rndkey1/$rndkey0, refilled from
        # displacements 0x00,0x10,...,0x70
        foreach my $i (0..7)
        {   &$round     ($i%2 ? $rndkey0 : $rndkey1,$i*0x10);   }
        &$aes           ($inout0,$rndkey1);
    &$aeslast           ($inout0,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}
107
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Both single-block entry points have the same shape, so they are emitted
# by one loop: "enc" first, then "dec". When $inline is off, the matching
# _aesni_<dir>rypt1 helper is generated just ahead of the function that
# calls it.
foreach my $dir ("enc","dec") {
    my $func="${PREFIX}_${dir}rypt";

    &aesni_generate1($dir) unless ($inline);
    &function_begin_B($func);
        &mov    ("eax",&wparam(0));             # inp
        &mov    ($key,&wparam(2));
        &movups ($inout0,&QWP(0,"eax"));        # load the input block
        &mov    ($rounds,&DWP(240,$key));
        &mov    ("eax",&wparam(1));             # out
        if ($inline)
        {   &aesni_inline_generate1($dir);      }
        else
        {   &call       ("_aesni_${dir}rypt1"); }
        &movups (&QWP(0,"eax"),$inout0);        # store the result
        &ret    ();
    &function_end_B($func);
}
139 \f
# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
# factor. Why is the 3x subroutine used in loops? Even though aes[enc|dec]
# latency is 6, it turned out that it can be scheduled only every
# *second* cycle. Thus 3x interleave is the one providing optimal
# utilization, i.e. when the subroutine's throughput is virtually the same
# as that of the non-interleaved subroutine [for number of input blocks up
# to 3]. This is why it makes no sense to implement a 2x subroutine. As
# soon as/if Intel improves throughput by making it possible to schedule
# the instructions in question *every* cycle I would have to implement
# 6x interleave and use it in loop...
#
# Emits _aesni_<p>rypt3, which transforms $inout0..$inout2 together. The
# argument is "enc" or "dec". The emitted code halves $rounds and advances
# $key, so callers restore both afterwards.
#
# The AES emitters are called through code references rather than string
# eval, so a failure inside an emitter is reported instead of being
# silently discarded.
sub aesni_generate3
{ my $p=shift;
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt3");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &shr            ($rounds,1);
        &$movekey       ($rndkey1,&QWP(16,$key));
        &lea            ($key,&DWP(32,$key));
        &pxor           ($inout0,$rndkey0);
        &pxor           ($inout1,$rndkey0);
        &pxor           ($inout2,$rndkey0);
        &jmp            (&label("${p}3_loop"));
    &set_label("${p}3_loop",16);
        &$aes           ($inout0,$rndkey1);
        &$movekey       ($rndkey0,&QWP(0,$key));
        &$aes           ($inout1,$rndkey1);
        &dec            ($rounds);      # sets the flags tested by jnz
        &$aes           ($inout2,$rndkey1);
        &$movekey       ($rndkey1,&QWP(16,$key));
        &$aes           ($inout0,$rndkey0);
        &lea            ($key,&DWP(32,$key));
        &$aes           ($inout1,$rndkey0);
        &$aes           ($inout2,$rndkey0);
        &jnz            (&label("${p}3_loop"));
    &$aes               ($inout0,$rndkey1);
    &$movekey           ($rndkey0,&QWP(0,$key));
    &$aes               ($inout1,$rndkey1);
    &$aes               ($inout2,$rndkey1);
    &$aeslast           ($inout0,$rndkey0);
    &$aeslast           ($inout1,$rndkey0);
    &$aeslast           ($inout2,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}
184
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# Emits _aesni_<p>rypt4, which transforms $inout0..$inout3 together. The
# argument is "enc" or "dec". The emitted code halves $rounds and advances
# $key, so callers restore both afterwards.
#
# The loop label is named after this routine's own interleave factor
# ("<p>4_loop"); it previously reused the 3x routine's label name. The AES
# emitters are called through code references rather than string eval, so
# a failure inside an emitter is reported instead of being silently
# discarded.
sub aesni_generate4
{ my $p=shift;
  my $aes=\&{"aes${p}"};
  my $aeslast=\&{"aes${p}last"};

    &function_begin_B("_aesni_${p}rypt4");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &$movekey       ($rndkey1,&QWP(16,$key));
        &shr            ($rounds,1);
        &lea            ($key,&DWP(32,$key));
        &pxor           ($inout0,$rndkey0);
        &pxor           ($inout1,$rndkey0);
        &pxor           ($inout2,$rndkey0);
        &pxor           ($inout3,$rndkey0);
        &jmp            (&label("${p}4_loop"));
    &set_label("${p}4_loop",16);
        &$aes           ($inout0,$rndkey1);
        &$movekey       ($rndkey0,&QWP(0,$key));
        &$aes           ($inout1,$rndkey1);
        &dec            ($rounds);      # sets the flags tested by jnz
        &$aes           ($inout2,$rndkey1);
        &$aes           ($inout3,$rndkey1);
        &$movekey       ($rndkey1,&QWP(16,$key));
        &$aes           ($inout0,$rndkey0);
        &lea            ($key,&DWP(32,$key));
        &$aes           ($inout1,$rndkey0);
        &$aes           ($inout2,$rndkey0);
        &$aes           ($inout3,$rndkey0);
        &jnz            (&label("${p}4_loop"));
    &$aes               ($inout0,$rndkey1);
    &$movekey           ($rndkey0,&QWP(0,$key));
    &$aes               ($inout1,$rndkey1);
    &$aes               ($inout2,$rndkey1);
    &$aes               ($inout3,$rndkey1);
    &$aeslast           ($inout0,$rndkey0);
    &$aeslast           ($inout1,$rndkey0);
    &$aeslast           ($inout2,$rndkey0);
    &$aeslast           ($inout3,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}
# Emit the interleaved helpers: 3x first, then 4x. The decrypt flavour is
# always generated; the encrypt flavour only when $PREFIX is "aesni".
foreach my $generate (\&aesni_generate3,\&aesni_generate4) {
    &$generate("enc") if ($PREFIX eq "aesni");
    &$generate("dec");
}
232
if ($PREFIX eq "aesni") {
# void aesni_ecb_encrypt (const void *in, void *out,
#                         size_t length, const AES_KEY *key,
#                         int enc);
#
# Emitted only when $PREFIX is "aesni". Lengths below 16 return at once
# and the length is rounded down to a multiple of 16. While more than
# four blocks remain they are processed three at a time; the last one to
# four blocks are handled by the tail code. A zero enc argument selects
# decryption.
&function_begin("aesni_ecb_encrypt");
        &mov    ($inp,&wparam(0));
        &mov    ($out,&wparam(1));
        &mov    ($len,&wparam(2));
        &mov    ($key,&wparam(3));
        &mov    ($rounds,&wparam(4));   # enc flag, parked in $rounds
        &cmp    ($len,16);
        &jb     (&label("ecb_ret"));
        &and    ($len,-16);
        &test   ($rounds,$rounds);      # flags are consumed by jz below
        &mov    ($rounds,&DWP(240,$key));
        &mov    ($key_,$key);           # backup $key
        &mov    ($rounds_,$rounds);     # backup $rounds
        &jz     (&label("ecb_decrypt"));

        &sub    ($len,0x40);
        &jbe    (&label("ecb_enc_tail"));
        &jmp    (&label("ecb_enc_loop3"));

&set_label("ecb_enc_loop3",16);
        &movups ($inout0,&QWP(0,$inp));
        &movups ($inout1,&QWP(0x10,$inp));
        &movups ($inout2,&QWP(0x20,$inp));
        &call   ("_aesni_encrypt3");
        &sub    ($len,0x30);            # flags are consumed by ja below
        &lea    ($inp,&DWP(0x30,$inp));
        &lea    ($out,&DWP(0x30,$out));
        &movups (&QWP(-0x30,$out),$inout0);
        &mov    ($key,$key_);           # restore $key
        &movups (&QWP(-0x20,$out),$inout1);
        &mov    ($rounds,$rounds_);     # restore $rounds
        &movups (&QWP(-0x10,$out),$inout2);
        &ja     (&label("ecb_enc_loop3"));

&set_label("ecb_enc_tail");
        &add    ($len,0x40);
        &jz     (&label("ecb_ret"));

        &cmp    ($len,0x10);
        &movups ($inout0,&QWP(0,$inp));
        &je     (&label("ecb_enc_one"));
        &cmp    ($len,0x20);
        &movups ($inout1,&QWP(0x10,$inp));
        &je     (&label("ecb_enc_two"));
        &cmp    ($len,0x30);
        &movups ($inout2,&QWP(0x20,$inp));
        &je     (&label("ecb_enc_three"));
        &movups ($inout3,&QWP(0x30,$inp));
        &call   ("_aesni_encrypt4");
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);
        &movups (&QWP(0x30,$out),$inout3);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_enc_one",16);
        if ($inline)
        {   &aesni_inline_generate1("enc");     }
        else
        {   &call       ("_aesni_encrypt1");    }
        &movups (&QWP(0,$out),$inout0);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_enc_two",16);
        &call   ("_aesni_encrypt3");    # only the first two results are stored
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_enc_three",16);
        &call   ("_aesni_encrypt3");
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_decrypt",16);
        &sub    ($len,0x40);
        &jbe    (&label("ecb_dec_tail"));
        &jmp    (&label("ecb_dec_loop3"));

&set_label("ecb_dec_loop3",16);
        &movups ($inout0,&QWP(0,$inp));
        &movups ($inout1,&QWP(0x10,$inp));
        &movups ($inout2,&QWP(0x20,$inp));
        &call   ("_aesni_decrypt3");
        &sub    ($len,0x30);            # flags are consumed by ja below
        &lea    ($inp,&DWP(0x30,$inp));
        &lea    ($out,&DWP(0x30,$out));
        &movups (&QWP(-0x30,$out),$inout0);
        &mov    ($key,$key_);           # restore $key
        &movups (&QWP(-0x20,$out),$inout1);
        &mov    ($rounds,$rounds_);     # restore $rounds
        &movups (&QWP(-0x10,$out),$inout2);
        &ja     (&label("ecb_dec_loop3"));

&set_label("ecb_dec_tail");
        &add    ($len,0x40);
        &jz     (&label("ecb_ret"));

        &cmp    ($len,0x10);
        &movups ($inout0,&QWP(0,$inp));
        &je     (&label("ecb_dec_one"));
        &cmp    ($len,0x20);
        &movups ($inout1,&QWP(0x10,$inp));
        &je     (&label("ecb_dec_two"));
        &cmp    ($len,0x30);
        &movups ($inout2,&QWP(0x20,$inp));
        &je     (&label("ecb_dec_three"));
        &movups ($inout3,&QWP(0x30,$inp));
        &call   ("_aesni_decrypt4");
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);
        &movups (&QWP(0x30,$out),$inout3);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_dec_one",16);
        if ($inline)
        {   &aesni_inline_generate1("dec");     }
        else
        {   &call       ("_aesni_decrypt1");    }
        &movups (&QWP(0,$out),$inout0);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_dec_two",16);
        &call   ("_aesni_decrypt3");    # only the first two results are stored
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_dec_three",16);
        &call   ("_aesni_decrypt3");
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);
        # falls through to ecb_ret

&set_label("ecb_ret");
&function_end("aesni_ecb_encrypt");
}
377
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#                           size_t length, const AES_KEY *key,
#                           unsigned char *ivp,const int enc);
#
# A zero length returns through cbc_ret, which writes the IV back to *ivp
# on every path. A zero enc argument selects decryption. Encryption works
# one block at a time and zero-pads a trailing partial block before
# encrypting it. Decryption processes three blocks at a time while more
# than four remain, then handles the tail; a trailing partial block is
# decrypted in full and only the remaining bytes are copied out.
&function_begin("${PREFIX}_cbc_encrypt");
        &mov    ($inp,&wparam(0));
        &mov    ($out,&wparam(1));
        &mov    ($len,&wparam(2));
        &mov    ($key,&wparam(3));
        &test   ($len,$len);
        &mov    ($key_,&wparam(4));     # ivp, kept in $key_ for now
        &jz     (&label("cbc_ret"));

        &cmp    (&wparam(5),0);         # flags are consumed by je below
        &movups ($ivec,&QWP(0,$key_));  # load IV
        &mov    ($rounds,&DWP(240,$key));
        &mov    ($key_,$key);           # backup $key
        &mov    ($rounds_,$rounds);     # backup $rounds
        &je     (&label("cbc_decrypt"));

        &movaps ($inout0,$ivec);
        &cmp    ($len,16);
        &jb     (&label("cbc_enc_tail"));
        &sub    ($len,16);
        &jmp    (&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
        &movups ($ivec,&QWP(0,$inp));
        &lea    ($inp,&DWP(16,$inp));
        &pxor   ($inout0,$ivec);
        if ($inline)
        {   &aesni_inline_generate1("enc");     }
        else
        {   &call       ("_aesni_encrypt1");    }
        &sub    ($len,16);              # flags are consumed by jnc below
        &lea    ($out,&DWP(16,$out));
        &mov    ($rounds,$rounds_);     # restore $rounds
        &mov    ($key,$key_);           # restore $key
        &movups (&QWP(-16,$out),$inout0);
        &jnc    (&label("cbc_enc_loop"));
        &add    ($len,16);
        &jnz    (&label("cbc_enc_tail"));
        &movaps ($ivec,$inout0);
        &jmp    (&label("cbc_ret"));

&set_label("cbc_enc_tail");
        &mov    ("ecx",$len);           # zaps $rounds
        &data_word(0xA4F3F689);         # rep movsb
        &mov    ("ecx",16);             # zero tail
        &sub    ("ecx",$len);
        &xor    ("eax","eax");          # zaps $len
        &data_word(0xAAF3F689);         # rep stosb
        &lea    ($out,&DWP(-16,$out));  # rewind $out by 1 block
        &mov    ($rounds,$rounds_);     # restore $rounds
        &mov    ($inp,$out);            # $inp and $out are the same
        &mov    ($key,$key_);           # restore $key
        &jmp    (&label("cbc_enc_loop"));

&set_label("cbc_decrypt",16);
        &sub    ($len,0x40);
        &jbe    (&label("cbc_dec_tail"));
        &jmp    (&label("cbc_dec_loop3"));

&set_label("cbc_dec_loop3",16);
        &movups ($inout0,&QWP(0,$inp));
        &movups ($inout1,&QWP(0x10,$inp));
        &movups ($inout2,&QWP(0x20,$inp));
        &movaps ($in0,$inout0);         # keep the first two input blocks
        &movaps ($in1,$inout1);         # for chaining after decryption
        &call   ("_aesni_decrypt3");
        &sub    ($len,0x30);            # flags are consumed by ja below
        &lea    ($inp,&DWP(0x30,$inp));
        &lea    ($out,&DWP(0x30,$out));
        &pxor   ($inout0,$ivec);
        &pxor   ($inout1,$in0);
        &movups ($ivec,&QWP(-0x10,$inp));       # third input block is next IV
        &pxor   ($inout2,$in1);
        &movups (&QWP(-0x30,$out),$inout0);
        &mov    ($rounds,$rounds_);     # restore $rounds
        &movups (&QWP(-0x20,$out),$inout1);
        &mov    ($key,$key_);           # restore $key
        &movups (&QWP(-0x10,$out),$inout2);
        &ja     (&label("cbc_dec_loop3"));

&set_label("cbc_dec_tail");
        &add    ($len,0x40);
        &jz     (&label("cbc_ret"));

        &movups ($inout0,&QWP(0,$inp));
        &cmp    ($len,0x10);
        &movaps ($in0,$inout0);
        &jbe    (&label("cbc_dec_one"));
        &movups ($inout1,&QWP(0x10,$inp));
        &cmp    ($len,0x20);
        &movaps ($in1,$inout1);
        &jbe    (&label("cbc_dec_two"));
        &movups ($inout2,&QWP(0x20,$inp));
        &cmp    ($len,0x30);
        &jbe    (&label("cbc_dec_three"));
        &movups ($inout3,&QWP(0x30,$inp));
        &call   ("_aesni_decrypt4");
        # $in1 shares its register with $inout3, so the second and third
        # input blocks are re-read from memory for chaining
        &movups ($rndkey0,&QWP(0x10,$inp));
        &movups ($rndkey1,&QWP(0x20,$inp));
        &pxor   ($inout0,$ivec);
        &pxor   ($inout1,$in0);
        &movups ($ivec,&QWP(0x30,$inp));
        &movups (&QWP(0,$out),$inout0);
        &pxor   ($inout2,$rndkey0);
        &pxor   ($inout3,$rndkey1);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);
        &movaps ($inout0,$inout3);      # last block is stored by the collector
        &lea    ($out,&DWP(0x30,$out));
        &jmp    (&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_one");
        if ($inline)
        {   &aesni_inline_generate1("dec");     }
        else
        {   &call       ("_aesni_decrypt1");    }
        &pxor   ($inout0,$ivec);
        &movaps ($ivec,$in0);
        &jmp    (&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_two");
        &call   ("_aesni_decrypt3");
        &pxor   ($inout0,$ivec);
        &pxor   ($inout1,$in0);
        &movups (&QWP(0,$out),$inout0);
        &movaps ($inout0,$inout1);
        &movaps ($ivec,$in1);
        &lea    ($out,&DWP(0x10,$out));
        &jmp    (&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_three");
        &call   ("_aesni_decrypt3");
        &pxor   ($inout0,$ivec);
        &pxor   ($inout1,$in0);
        &pxor   ($inout2,$in1);
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &movaps ($inout0,$inout2);
        &movups ($ivec,&QWP(0x20,$inp));
        &lea    ($out,&DWP(0x20,$out));
        # falls through to cbc_dec_tail_collected

&set_label("cbc_dec_tail_collected");
        &and    ($len,15);
        &jnz    (&label("cbc_dec_tail_partial"));
        &movups (&QWP(0,$out),$inout0);
        &jmp    (&label("cbc_ret"));

&set_label("cbc_dec_tail_partial");
        &mov    ($key_,"esp");          # save the stack pointer
        &sub    ("esp",16);
        &and    ("esp",-16);            # aligned scratch block for movaps
        &movaps (&QWP(0,"esp"),$inout0);
        &mov    ($inp,"esp");
        &mov    ("ecx",$len);
        &data_word(0xA4F3F689);         # rep movsb
        &mov    ("esp",$key_);          # restore the stack pointer

&set_label("cbc_ret");
        &mov    ($key_,&wparam(4));
        &movups (&QWP(0,$key_),$ivec);  # output IV
&function_end("${PREFIX}_cbc_encrypt");
542
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,
# input:
#       "eax"   const unsigned char *userKey
#       $rounds int bits
#       $key    AES_KEY *key
# output:
#       "eax"   return code: 0 on success, -1 if either pointer is NULL,
#               -2 if bits is not 128, 192 or 256
#       $rounds rounds-1, i.e. 9, 11 or 13; the same value is stored at
#               offset 240 of *key
#
# Each key size emits a fixed sequence of aeskeygenassist/helper-call
# pairs. The repeated pairs are driven by small tables below; the emitted
# instruction order is the same as when every pair is written out by hand.

&function_begin_B("_aesni_set_encrypt_key");
        &test   ("eax","eax");
        &jz     (&label("bad_pointer"));
        &test   ($key,$key);
        &jz     (&label("bad_pointer"));

        &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
        &pxor   ("xmm4","xmm4");        # low dword of xmm4 is assumed 0
        &lea    ($key,&DWP(16,$key));
        &cmp    ($rounds,256);
        &je     (&label("14rounds"));
        &cmp    ($rounds,192);
        &je     (&label("12rounds"));
        &cmp    ($rounds,128);
        &jne    (&label("bad_keybits"));

&set_label("10rounds",16);
        &mov            ($rounds,9);
        &$movekey       (&QWP(-16,$key),"xmm0");        # round 0
        &aeskeygenassist("xmm1","xmm0",0x01);           # round 1
        &call           (&label("key_128_cold"));
        # rounds 2 through 10
        foreach my $rcon (0x02,0x04,0x08,0x10,0x20,0x40,0x80,0x1b,0x36)
        {   &aeskeygenassist("xmm1","xmm0",$rcon);
            &call           (&label("key_128"));
        }
        &$movekey       (&QWP(0,$key),"xmm0");
        &mov            (&DWP(80,$key),$rounds);        # offset 240 of *key
        &xor            ("eax","eax");
        &ret();

&set_label("key_128",16);
        &$movekey       (&QWP(0,$key),"xmm0");  # store the previous round key
        &lea            ($key,&DWP(16,$key));
&set_label("key_128_cold");
        &shufps         ("xmm4","xmm0",0b00010000);
        &pxor           ("xmm0","xmm4");
        &shufps         ("xmm4","xmm0",0b10001100);
        &pxor           ("xmm0","xmm4");
        &pshufd         ("xmm1","xmm1",0b11111111);     # critical path
        &pxor           ("xmm0","xmm1");
        &ret();

&set_label("12rounds",16);
        &movq           ("xmm2",&QWP(16,"eax"));        # remaining 1/3 of *userKey
        &mov            ($rounds,11);
        &$movekey       (&QWP(-16,$key),"xmm0");        # round 0
        &aeskeygenassist("xmm1","xmm2",0x01);           # round 1,2
        &call           (&label("key_192a_cold"));
        # the remaining steps alternate between the key_192b and key_192a
        # helpers, covering the schedule up to round 12
        foreach my $step ([0x02,"key_192b"],[0x04,"key_192a"],
                          [0x08,"key_192b"],[0x10,"key_192a"],
                          [0x20,"key_192b"],[0x40,"key_192a"],
                          [0x80,"key_192b"])
        {   my ($rcon,$helper)=@$step;
            &aeskeygenassist("xmm1","xmm2",$rcon);
            &call           (&label($helper));
        }
        &$movekey       (&QWP(0,$key),"xmm0");
        &mov            (&DWP(48,$key),$rounds);        # offset 240 of *key
        &xor            ("eax","eax");
        &ret();

&set_label("key_192a",16);
        &$movekey       (&QWP(0,$key),"xmm0");
        &lea            ($key,&DWP(16,$key));
&set_label("key_192a_cold",16);
        &movaps         ("xmm5","xmm2");
&set_label("key_192b_warm");
        &shufps         ("xmm4","xmm0",0b00010000);
        &movaps         ("xmm3","xmm2");
        &pxor           ("xmm0","xmm4");
        &shufps         ("xmm4","xmm0",0b10001100);
        &pslldq         ("xmm3",4);
        &pxor           ("xmm0","xmm4");
        &pshufd         ("xmm1","xmm1",0b01010101);     # critical path
        &pxor           ("xmm2","xmm3");
        &pxor           ("xmm0","xmm1");
        &pshufd         ("xmm3","xmm0",0b11111111);
        &pxor           ("xmm2","xmm3");
        &ret();

&set_label("key_192b",16);
        &movaps         ("xmm3","xmm0");
        &shufps         ("xmm5","xmm0",0b01000100);
        &$movekey       (&QWP(0,$key),"xmm5");
        &shufps         ("xmm3","xmm2",0b01001110);
        &$movekey       (&QWP(16,$key),"xmm3");
        &lea            ($key,&DWP(32,$key));
        &jmp            (&label("key_192b_warm"));

&set_label("14rounds",16);
        &movups         ("xmm2",&QWP(16,"eax"));        # remaining half of *userKey
        &mov            ($rounds,13);
        &lea            ($key,&DWP(16,$key));
        &$movekey       (&QWP(-32,$key),"xmm0");        # round 0
        &$movekey       (&QWP(-16,$key),"xmm2");        # round 1
        &aeskeygenassist("xmm1","xmm2",0x01);           # round 2
        &call           (&label("key_256a_cold"));
        # rounds 3 through 14: each pass produces an odd round from xmm0
        # with $rcon, then an even round from xmm2 with the next constant
        foreach my $rcon (0x01,0x02,0x04,0x08,0x10,0x20)
        {   &aeskeygenassist("xmm1","xmm0",$rcon);
            &call           (&label("key_256b"));
            &aeskeygenassist("xmm1","xmm2",$rcon<<1);
            &call           (&label("key_256a"));
        }
        &$movekey       (&QWP(0,$key),"xmm0");
        &mov            (&DWP(16,$key),$rounds);        # offset 240 of *key
        &xor            ("eax","eax");
        &ret();

&set_label("key_256a",16);
        &$movekey       (&QWP(0,$key),"xmm2");
        &lea            ($key,&DWP(16,$key));
&set_label("key_256a_cold");
        &shufps         ("xmm4","xmm0",0b00010000);
        &pxor           ("xmm0","xmm4");
        &shufps         ("xmm4","xmm0",0b10001100);
        &pxor           ("xmm0","xmm4");
        &pshufd         ("xmm1","xmm1",0b11111111);     # critical path
        &pxor           ("xmm0","xmm1");
        &ret();

&set_label("key_256b",16);
        &$movekey       (&QWP(0,$key),"xmm0");
        &lea            ($key,&DWP(16,$key));

        &shufps         ("xmm4","xmm2",0b00010000);
        &pxor           ("xmm2","xmm4");
        &shufps         ("xmm4","xmm2",0b10001100);
        &pxor           ("xmm2","xmm4");
        &pshufd         ("xmm1","xmm1",0b10101010);     # critical path
        &pxor           ("xmm2","xmm1");
        &ret();

&set_label("bad_pointer",4);
        &mov    ("eax",-1);
        &ret    ();
&set_label("bad_keybits",4);
        &mov    ("eax",-2);
        &ret    ();
&function_end_B("_aesni_set_encrypt_key");
731
# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Public wrapper: loads the three stack arguments into the registers the
# private _aesni_set_encrypt_key routine expects and returns whatever that
# routine leaves in "eax".
{
    my $func="${PREFIX}_set_encrypt_key";
    my $argno=0;

    &function_begin_B($func);
        foreach my $reg ("eax",$rounds,$key)    # userKey, bits, key
        {   &mov        ($reg,&wparam($argno++));       }
        &call   ("_aesni_set_encrypt_key");
        &ret    ();
    &function_end_B($func);
}
741
# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Builds the encryption schedule with _aesni_set_encrypt_key and, if that
# succeeded, converts it in place: the first and last round keys are
# swapped as they are, the pairs in between are swapped and passed
# through aesimc, and the middle round key is passed through aesimc
# where it stands. A non-zero code from _aesni_set_encrypt_key is
# returned unchanged.
&function_begin_B("${PREFIX}_set_decrypt_key");
        &mov    ("eax",&wparam(0));
        &mov    ($rounds,&wparam(1));
        &mov    ($key,&wparam(2));
        &call   ("_aesni_set_encrypt_key");
        &mov    ($key,&wparam(2));      # reload: the callee advanced $key
        &shl    ($rounds,4);    # rounds-1 after _aesni_set_encrypt_key
        &test   ("eax","eax");
        &jnz    (&label("dec_key_ret"));
        &lea    ("eax",&DWP(16,$key,$rounds));  # end of key schedule

        &$movekey       ("xmm0",&QWP(0,$key));  # just swap
        &$movekey       ("xmm1",&QWP(0,"eax"));
        &$movekey       (&QWP(0,"eax"),"xmm0");
        &$movekey       (&QWP(0,$key),"xmm1");
        &lea            ($key,&DWP(16,$key));
        &lea            ("eax",&DWP(-16,"eax"));

&set_label("dec_key_inverse");
        &$movekey       ("xmm0",&QWP(0,$key));  # swap and inverse
        &$movekey       ("xmm1",&QWP(0,"eax"));
        &aesimc         ("xmm0","xmm0");
        &aesimc         ("xmm1","xmm1");
        &lea            ($key,&DWP(16,$key));
        &lea            ("eax",&DWP(-16,"eax"));
        &cmp            ("eax",$key);           # flags are consumed by ja below
        &$movekey       (&QWP(16,"eax"),"xmm0");
        &$movekey       (&QWP(-16,$key),"xmm1");
        &ja             (&label("dec_key_inverse"));

        &$movekey       ("xmm0",&QWP(0,$key));  # inverse middle
        &aesimc         ("xmm0","xmm0");
        &$movekey       (&QWP(0,$key),"xmm0");

        &xor            ("eax","eax");          # return success
&set_label("dec_key_ret");
        &ret    ();
&function_end_B("${PREFIX}_set_decrypt_key");
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");

&asm_finish();