2 # Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 ######################################################################
11 ## Constant-time SSSE3 AES core implementation.
14 ## By Mike Hamburg (Stanford University), 2009
17 ## For details see http://shiftleft.org/papers/vector_aes/ and
18 ## http://crypto.stanford.edu/vpaes/.
20 ######################################################################
23 # Port vpaes-x86_64.pl as 32-bit "almost" drop-in replacement for
24 # aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
25 # doesn't handle partial vectors (doesn't have to if called from
26 # EVP only). "Drop-in" implies that this module doesn't share key
27 # schedule structure with the original nor does it make assumption
28 # about its alignment...
30 # Performance summary. aes-586.pl column lists large-block CBC
31 # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
32 # byte processed with 128-bit key, and vpaes-x86.pl column - [also
33 # large-block CBC] encrypt/decrypt.
35 # aes-586.pl vpaes-x86.pl
37 # Core 2(**) 28.1/41.4/18.3 21.9/25.2(***)
38 # Nehalem 27.9/40.4/18.1 10.2/11.9
39 # Atom 70.7/92.1/60.1 61.1/75.4(***)
40 # Silvermont 45.4/62.9/24.1 49.2/61.1(***)
42 # (*) "Hyper-threading" in the context refers rather to cache shared
43 # among multiple cores, than to specifically Intel HTT. As vast
44 # majority of contemporary cores share cache, slower code path
45 # is common place. In other words "with-hyper-threading-off"
46 # results are presented mostly for reference purposes.
48 # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
50 # (***) Less impressive improvement on Core 2 and Atom is due to slow
51 # pshufb, yet it's respectable +28%/64% improvement on Core 2
52 # and +15% on Atom (as implied, over "hyper-threading-safe" code path).
# Work out the directory this script lives in, so the perlasm helper
# modules can be located independently of the current working directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");

# The last command-line argument (if any) names the output file.  Send
# STDOUT there using the three-argument form of open, and abort if that
# fails instead of silently emitting the assembly somewhere else.
$output = pop and (open(STDOUT,'>',$output)
		   or die "can't open $output: $!");

# First remaining argument is passed on as-is; a trailing "386" argument
# sets $x86only.
&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");

# Symbolic names for the general-purpose registers used throughout.
my ($round, $base, $magic, $key, $const, $inp, $out)=
   ("eax",  "ebx", "ecx",  "edx","ebp",  "esi","edi");
# Constant tables.
#
# Every $k_* value below is the byte offset of a table relative to
# "_vpaes_consts+0x30", which is the address the entry points load into
# $const.  Each &data_word() row is 16 bytes, so the offsets follow
# directly from the order in which the rows are emitted.
&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");

&set_label("_vpaes_consts",64);
$k_inv=-0x30;		# inv, inva
	&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
	&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);

# The 16-byte row after the two $k_inv rows sits at -0x30+0x20.  The code
# refers to it as $k_s0F, so the offset has to be defined here.
$k_s0F=-0x10;		# s0F
	&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);

$k_ipt=0x00;		# input transform (lo, hi)
	&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
	&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);

$k_sb1=0x20;		# sb1u, sb1t
	&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
	&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
$k_sb2=0x40;		# sb2u, sb2t
	&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
	&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
$k_sbo=0x60;		# sbou, sbot
	&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
	&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);

$k_mc_forward=0x80;	# mc_forward
	&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
	&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
	&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
	&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);

$k_mc_backward=0xc0;	# mc_backward
	&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
	&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
	&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
	&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);

# Four rows follow the four mc_backward rows (0xc0+0x40) and precede
# $k_rcon at 0x140.  The code indexes them through $k_sr, so the offset
# has to be defined here.
$k_sr=0x100;		# sr
	&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
	&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
	&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
	&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);

$k_rcon=0x140;		# rcon
	&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);

$k_s63=0x150;		# s63: all equal to 0x63 transformed
	&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);

$k_opt=0x160;		# output transform
	&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
	&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);

$k_deskew=0x180;	# deskew tables: inverts the sbox's "skew"
	&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
	&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);

##
##  Key schedule constants
##
$k_dksd=0x1a0;		# decryption key schedule: invskew x*D
	&data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
	&data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
$k_dksb=0x1c0;		# decryption key schedule: invskew x*B
	&data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
	&data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
$k_dkse=0x1e0;		# decryption key schedule: invskew x*E + 0x63
	&data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
	&data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
$k_dks9=0x200;		# decryption key schedule: invskew x*9
	&data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
	&data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);

##
##  Round function constants
##
$k_dipt=0x220;		# decryption input transform
	&data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
	&data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);

$k_dsb9=0x240;		# decryption sbox output *9*u, *9*t
	&data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
	&data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
$k_dsbd=0x260;		# decryption sbox output *D*u, *D*t
	&data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
	&data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
$k_dsbb=0x280;		# decryption sbox output *B*u, *B*t
	&data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
	&data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
$k_dsbe=0x2a0;		# decryption sbox output *E*u, *E*t
	&data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
	&data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
$k_dsbo=0x2c0;		# decryption sbox final output
	&data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
	&data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
&asciz	("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
# _vpaes_preheat: finish computing the constants pointer and load the two
# lookup tables that the encrypt/decrypt cores expect in %xmm6 and %xmm7.
#
# On entry $const (%ebp) holds the link-time difference
# "_vpaes_consts+0x30 - pic_point", where pic_point is the label each caller
# places immediately after its call to this routine.  Adding the dword at
# (%esp) -- the return address pushed by that call, i.e. the run-time
# address of pic_point -- leaves $const pointing at _vpaes_consts+0x30,
# the origin that the $k_* table offsets are relative to.
#
# NOTE(review): no `&ret()` is visible before function_end_B in this block;
# confirm against the complete file that the routine returns to its caller.
169 &function_begin_B("_vpaes_preheat");
170 &add ($const,&DWP(0,"esp")); # $const += return address (PIC fix-up)
171 &movdqa ("xmm7",&QWP($k_inv,$const)); # xmm7 = table at $k_inv
172 &movdqa ("xmm6",&QWP($k_s0F,$const)); # xmm6 = table at $k_s0F
174 &function_end_B("_vpaes_preheat");
##
##  _vpaes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input block
##     %xmm6-%xmm7 as in _vpaes_preheat
##    (%edx) = scheduled keys; the round count is read from 240(%edx)
##
##  Output in %xmm0
##  Clobbers  %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##  $base (%ebx) is pointed at the mc_backward table so that, indexed by
##  $magic (%ecx), -0x40/0/+0x40 from it select the matching rows of the
##  mc_forward, mc_backward and sr tables respectively.
##
##  NOTE(review): in this block nothing initialises $magic before the loop
##  indexes with it, no `psrld` separates the first pandn from the pshufb
##  that consumes %xmm1, no flag-setting instruction precedes the first
##  pass through enc_entry's `jnz`, and no `&ret()` precedes
##  function_end_B.  Confirm each of these against the complete file.
##
&function_begin_B("_vpaes_encrypt_core");
	&mov	($round,&DWP(240,$key));
	&movdqa	("xmm1","xmm6");			# terminated: was fused with the next call via '&'
	&movdqa	("xmm2",&QWP($k_ipt,$const));		# 2 : iptlo
	&pandn	("xmm1","xmm0");			# 1 = high nibbles of input
	&pand	("xmm0","xmm6");			# 0 = low nibbles of input
	&movdqu	("xmm5",&QWP(0,$key));			# 5 = round key at ($key)
	&pshufb	("xmm2","xmm0");
	&movdqa	("xmm0",&QWP($k_ipt+16,$const));	# 0 : ipthi
	&pxor	("xmm2","xmm5");
	&pshufb	("xmm0","xmm1");
	&lea	($base,&DWP($k_mc_backward,$const));
	&pxor	("xmm0","xmm2");
	&jmp	(&label("enc_entry"));


&set_label("enc_loop",16);
	# middle of middle round
	&movdqa	("xmm4",&QWP($k_sb1,$const));		# 4 : sb1u
	&movdqa	("xmm0",&QWP($k_sb1+16,$const));	# 0 : sb1t
	&pshufb	("xmm4","xmm2");			# 4 = sb1u
	&pshufb	("xmm0","xmm3");			# 0 = sb1t
	&pxor	("xmm4","xmm5");			# 4 = sb1u + k
	&movdqa	("xmm5",&QWP($k_sb2,$const));		# 5 : sb2u
	&pxor	("xmm0","xmm4");			# 0 = A
	&movdqa	("xmm1",&QWP(-0x40,$base,$magic));	# .Lk_mc_forward[]
	&pshufb	("xmm5","xmm2");			# 5 = sb2u
	&movdqa	("xmm2",&QWP($k_sb2+16,$const));	# 2 : sb2t
	&movdqa	("xmm4",&QWP(0,$base,$magic));		# .Lk_mc_backward[]
	&pshufb	("xmm2","xmm3");			# 2 = sb2t
	&movdqa	("xmm3","xmm0");			# 3 = A
	&pxor	("xmm2","xmm5");			# 2 = 2A
	&pshufb	("xmm0","xmm1");			# 0 = B
	&add	($key,16);				# next key
	&pxor	("xmm0","xmm2");			# 0 = 2A+B
	&pshufb	("xmm3","xmm4");			# 3 = D
	&add	($magic,16);				# next mc
	&pxor	("xmm3","xmm0");			# 3 = 2A+B+D
	&pshufb	("xmm0","xmm1");			# 0 = 2B+C
	&and	($magic,0x30);				# ... mod 4
	&sub	($round,1);				# nr-- (sets ZF for the jnz below)
	&pxor	("xmm0","xmm3");			# 0 = 2A+3B+C+D

&set_label("enc_entry");
	# top of round
	&movdqa	("xmm1","xmm6");			# 1 : i
	&movdqa	("xmm5",&QWP($k_inv+16,$const));	# 5 : a/k
	&pandn	("xmm1","xmm0");			# 1 = i<<4
	&psrld	("xmm1",4);				# 1 = i
	&pand	("xmm0","xmm6");			# 0 = k
	&pshufb	("xmm5","xmm0");			# 5 = a/k
	&movdqa	("xmm3","xmm7");			# 3 : 1/i
	&pxor	("xmm0","xmm1");			# 0 = j
	&pshufb	("xmm3","xmm1");			# 3 = 1/i
	&movdqa	("xmm4","xmm7");			# 4 : 1/j
	&pxor	("xmm3","xmm5");			# 3 = iak = 1/i + a/k
	&pshufb	("xmm4","xmm0");			# 4 = 1/j
	&movdqa	("xmm2","xmm7");			# 2 : 1/iak
	&pxor	("xmm4","xmm5");			# 4 = jak = 1/j + a/k
	&pshufb	("xmm2","xmm3");			# 2 = 1/iak
	&movdqa	("xmm3","xmm7");			# 3 : 1/jak
	&pxor	("xmm2","xmm0");			# 2 = io
	&pshufb	("xmm3","xmm4");			# 3 = 1/jak
	&movdqu	("xmm5",&QWP(0,$key));			# 5 = round key at ($key)
	&pxor	("xmm3","xmm1");			# 3 = jo
	&jnz	(&label("enc_loop"));			# SSE ops leave EFLAGS untouched

	# middle of last round
	&movdqa	("xmm4",&QWP($k_sbo,$const));		# 4 : sbou	.Lk_sbo
	&movdqa	("xmm0",&QWP($k_sbo+16,$const));	# 0 : sbot	.Lk_sbo+16
	&pshufb	("xmm4","xmm2");			# 4 = sbou
	&pxor	("xmm4","xmm5");			# 4 = sbou + k
	&pshufb	("xmm0","xmm3");			# 0 = sbot
	&movdqa	("xmm1",&QWP(0x40,$base,$magic));	# .Lk_sr[]
	&pxor	("xmm0","xmm4");			# 0 = A
	&pshufb	("xmm0","xmm1");			# final permutation via sr row
&function_end_B("_vpaes_encrypt_core");
## Decryption core: AES-decrypt the block in %xmm0, result in %xmm0.
275 ## Same API as encryption core.
##
## $base (%ebx) is pointed at the dsbd table; the dipt, dsb9, dsbb, dsbe
## and dsbo tables are then addressed as fixed displacements from it.
## %xmm5 holds one row of mc_forward and is rotated with palignr on every
## round.  $magic (%ecx) ends up as a pointer into the sr table and is used
## for the final output permutation.
##
## NOTE(review): in this block no `psrld` separates the first pandn from the
## pshufb that consumes %xmm1, $magic is used unscaled when forming the sr
## pointer, nothing sets the flags before the first pass through dec_entry's
## `jnz`, and no `&ret()` precedes function_end_B.  Confirm each of these
## against the complete file.
277 &function_begin_B("_vpaes_decrypt_core");
278 &lea ($base,&DWP($k_dsbd,$const)); # $base -> dsbd table
279 &mov ($round,&DWP(240,$key)); # round count stored at 240($key)
280 &movdqa ("xmm1","xmm6");
281 &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base)); # 2 : first dipt row
282 &pandn ("xmm1","xmm0"); # 1 = high nibbles of input
283 &mov ($magic,$round);
285 &movdqu ("xmm5",&QWP(0,$key)); # 5 = round key at ($key)
287 &pand ("xmm0","xmm6"); # 0 = low nibbles of input
288 &pshufb ("xmm2","xmm0");
289 &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base)); # 0 : second dipt row
291 &pshufb ("xmm0","xmm1");
293 &pxor ("xmm2","xmm5");
294 &movdqa ("xmm5",&QWP($k_mc_forward+48,$const)); # 5 : fourth mc_forward row
295 &pxor ("xmm0","xmm2");
297 &lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic)); # $magic = &sr table + $magic
298 &jmp (&label("dec_entry"));
300 &set_label("dec_loop",16);
302 ## Inverse mix columns
## On entry to the loop body %xmm0 holds the round key loaded at the end of
## dec_entry; the sb9/sbd/sbb/sbe table outputs are xored into it in turn,
## with a pshufb by %xmm5 between the stages.
304 &movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
305 &movdqa ("xmm1",&QWP(-0x10,$base)); # 1 : sb9t
306 &pshufb ("xmm4","xmm2"); # 4 = sb9u
307 &pshufb ("xmm1","xmm3"); # 1 = sb9t
308 &pxor ("xmm0","xmm4"); # 0 = key + sb9u
309 &movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
310 &pxor ("xmm0","xmm1"); # 0 = ch
311 &movdqa ("xmm1",&QWP(0x10,$base)); # 1 : sbdt
313 &pshufb ("xmm4","xmm2"); # 4 = sbdu
314 &pshufb ("xmm0","xmm5"); # MC ch
315 &pshufb ("xmm1","xmm3"); # 1 = sbdt
316 &pxor ("xmm0","xmm4"); # 0 = ch
317 &movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
318 &pxor ("xmm0","xmm1"); # 0 = ch
319 &movdqa ("xmm1",&QWP(0x30,$base)); # 1 : sbbt
321 &pshufb ("xmm4","xmm2"); # 4 = sbbu
322 &pshufb ("xmm0","xmm5"); # MC ch
323 &pshufb ("xmm1","xmm3"); # 1 = sbbt
324 &pxor ("xmm0","xmm4"); # 0 = ch
325 &movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
326 &pxor ("xmm0","xmm1"); # 0 = ch
327 &movdqa ("xmm1",&QWP(0x50,$base)); # 1 : sbet
329 &pshufb ("xmm4","xmm2"); # 4 = sbeu
330 &pshufb ("xmm0","xmm5"); # MC ch
331 &pshufb ("xmm1","xmm3"); # 1 = sbet
332 &pxor ("xmm0","xmm4"); # 0 = ch
333 &add ($key,16); # next round key
334 &palignr("xmm5","xmm5",12); # rotate the mc permutation for the next round
335 &pxor ("xmm0","xmm1"); # 0 = ch
336 &sub ($round,1); # nr-- (sets ZF for the jnz below)
338 &set_label("dec_entry");
340 &movdqa ("xmm1","xmm6"); # 1 : i
341 &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
342 &pandn ("xmm1","xmm0"); # 1 = i<<4
343 &pand ("xmm0","xmm6"); # 0 = k
344 &psrld ("xmm1",4); # 1 = i
345 &pshufb ("xmm2","xmm0"); # 2 = a/k
346 &movdqa ("xmm3","xmm7"); # 3 : 1/i
347 &pxor ("xmm0","xmm1"); # 0 = j
348 &pshufb ("xmm3","xmm1"); # 3 = 1/i
349 &movdqa ("xmm4","xmm7"); # 4 : 1/j
350 &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
351 &pshufb ("xmm4","xmm0"); # 4 = 1/j
352 &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
353 &movdqa ("xmm2","xmm7"); # 2 : 1/iak
354 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
355 &movdqa ("xmm3","xmm7"); # 3 : 1/jak
356 &pxor ("xmm2","xmm0"); # 2 = io
357 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
358 &movdqu ("xmm0",&QWP(0,$key)); # 0 = round key at ($key)
359 &pxor ("xmm3","xmm1"); # 3 = jo
360 &jnz (&label("dec_loop")); # SSE ops leave EFLAGS untouched
362 # middle of last round
363 &movdqa ("xmm4",&QWP(0x60,$base)); # 4 : sbou
364 &pshufb ("xmm4","xmm2"); # 4 = sbou
365 &pxor ("xmm4","xmm0"); # 4 = sbou + k
366 &movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot
367 &movdqa ("xmm2",&QWP(0,$magic)); # 2 : sr row selected by $magic
368 &pshufb ("xmm0","xmm3"); # 0 = sbot
369 &pxor ("xmm0","xmm4"); # 0 = A
370 &pshufb ("xmm0","xmm2"); # final permutation via sr row
372 &function_end_B("_vpaes_decrypt_core");
374 ########################################################
376 ## AES key schedule ##
378 ########################################################
## _vpaes_schedule_core: expand the user key at ($inp) into the round-key
## array at ($key), dispatching to a 128-, 192- or 256-bit specific loop.
##
## Like _vpaes_preheat, it first adds its own return address to $const to
## obtain the run-time address of the constant tables.  The rcon value that
## the x86_64 original keeps in %xmm8 lives in a stack slot here: it is
## stored at 4(%esp) in this frame, which is 8(%esp) inside the routines
## this function calls.
##
## NOTE(review): several conditional jumps below (jnz/ja/je/jz) have no
## flag-setting instruction visible ahead of them in this block, and no
## `&ret()` is visible before function_end_B.  The SSE instructions in
## between do not modify EFLAGS, so the tests these jumps depend on must be
## confirmed against the complete file.
379 &function_begin_B("_vpaes_schedule_core");
380 &add ($const,&DWP(0,"esp")); # $const += return address (PIC fix-up)
381 &movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
382 &movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
385 &movdqa ("xmm3","xmm0"); # keep untransformed key for the decrypt path
386 &lea ($base,&DWP($k_ipt,$const)); # transform tables = input transform
387 &movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
388 &call ("_vpaes_schedule_transform");
389 &movdqa ("xmm7","xmm0"); # xmm7 = transformed key
392 &jnz (&label("schedule_am_decrypting"));
394 # encrypting, output zeroth round key after transform
395 &movdqu (&QWP(0,$key),"xmm0");
396 &jmp (&label("schedule_go"));
398 &set_label("schedule_am_decrypting");
399 # decrypting, output zeroth round key after shiftrows
400 &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); # sr row selected by $magic
401 &pshufb ("xmm3","xmm1");
402 &movdqu (&QWP(0,$key),"xmm3");
405 &set_label("schedule_go");
# Dispatch on key size; falls through to the 128-bit schedule.
407 &ja (&label("schedule_256"));
408 &je (&label("schedule_192"));
414 ## 128-bit specific part of key schedule.
416 ## This schedule is really simple, because all its parts
417 ## are accomplished by the subroutines.
419 &set_label("schedule_128");
422 &set_label("loop_schedule_128");
423 &call ("_vpaes_schedule_round");
425 &jz (&label("schedule_mangle_last"));
426 &call ("_vpaes_schedule_mangle"); # write output
427 &jmp (&label("loop_schedule_128"));
432 ## 192-bit specific part of key schedule.
434 ## The main body of this schedule is the same as the 128-bit
435 ## schedule, but with more smearing. The long, high side is
436 ## stored in %xmm7 as before, and the short, low side is in
437 ## the high bits of %xmm6.
439 ## This schedule is somewhat nastier, however, because each
440 ## round produces 192 bits of key material, or 1.5 round keys.
441 ## Therefore, on each cycle we do 2 rounds and produce 3 round keys.
444 &set_label("schedule_192",16);
445 &movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
446 &call ("_vpaes_schedule_transform"); # input transform
447 &movdqa ("xmm6","xmm0"); # save short part
448 &pxor ("xmm4","xmm4"); # clear 4
449 &movhlps("xmm6","xmm4"); # clobber low side with zeros
452 &set_label("loop_schedule_192");
453 &call ("_vpaes_schedule_round");
454 &palignr("xmm0","xmm6",8); # xmm0 = low half of xmm0 : high half of xmm6
455 &call ("_vpaes_schedule_mangle"); # save key n
456 &call ("_vpaes_schedule_192_smear");
457 &call ("_vpaes_schedule_mangle"); # save key n+1
458 &call ("_vpaes_schedule_round");
460 &jz (&label("schedule_mangle_last"));
461 &call ("_vpaes_schedule_mangle"); # save key n+2
462 &call ("_vpaes_schedule_192_smear");
463 &jmp (&label("loop_schedule_192"));
468 ## 256-bit specific part of key schedule.
470 ## The structure here is very similar to the 128-bit
471 ## schedule, but with an additional "low side" in
472 ## %xmm6. The low side's rounds are the same as the
473 ## high side's, except no rcon and no rotation.
475 &set_label("schedule_256",16);
476 &movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
477 &call ("_vpaes_schedule_transform"); # input transform
480 &set_label("loop_schedule_256");
481 &call ("_vpaes_schedule_mangle"); # output low result
482 &movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
# high round
485 &call ("_vpaes_schedule_round");
487 &jz (&label("schedule_mangle_last"));
488 &call ("_vpaes_schedule_mangle");
490 # low round. swap xmm7 and xmm6
491 &pshufd ("xmm0","xmm0",0xFF); # broadcast the top dword of xmm0
492 &movdqa (&QWP(20,"esp"),"xmm7"); # park high side in a stack slot
493 &movdqa ("xmm7","xmm6");
494 &call ("_vpaes_schedule_low_round");
495 &movdqa ("xmm7",&QWP(20,"esp")); # restore high side
497 &jmp (&label("loop_schedule_256"));
500 ## .aes_schedule_mangle_last
502 ## Mangler for last round of key schedule
504 ## when encrypting, outputs out(%xmm0) ^ 63
505 ## when decrypting, outputs unskew(%xmm0)
507 ## Always called right before return... jumps to cleanup and exits
509 &set_label("schedule_mangle_last",16);
510 # schedule last round key from xmm0
511 &lea ($base,&DWP($k_deskew,$const)); # default tables: deskew (decrypt path)
513 &jnz (&label("schedule_mangle_last_dec"));
# encrypting
516 &movdqa ("xmm1",&QWP($k_sr,$const,$magic));
517 &pshufb ("xmm0","xmm1"); # output permute
518 &lea ($base,&DWP($k_opt,$const)); # prepare to output transform
521 &set_label("schedule_mangle_last_dec");
523 &pxor ("xmm0",&QWP($k_s63,$const));
524 &call ("_vpaes_schedule_transform"); # output transform
525 &movdqu (&QWP(0,$key),"xmm0"); # save last key
# cleanup: zero every xmm register so no key material is left behind
528 &pxor ("xmm0","xmm0");
529 &pxor ("xmm1","xmm1");
530 &pxor ("xmm2","xmm2");
531 &pxor ("xmm3","xmm3");
532 &pxor ("xmm4","xmm4");
533 &pxor ("xmm5","xmm5");
534 &pxor ("xmm6","xmm6");
535 &pxor ("xmm7","xmm7");
537 &function_end_B("_vpaes_schedule_core");
540 ## .aes_schedule_192_smear
542 ## Smear the short, low side in the 192-bit key schedule.
## (Dwords are listed from most to least significant in the diagrams below.)
##
## Inputs:
545 ## %xmm7: high side, b a x y
546 ## %xmm6: low side, d c 0 0
##
## Outputs:
550 ## %xmm6: b+c+d b+c 0 0
551 ## %xmm0: b+c+d b+c b a
##
## Clobbers %xmm1 (left zeroed).
##
## NOTE(review): no `&ret()` is visible before function_end_B in this block;
## confirm against the complete file.
553 &function_begin_B("_vpaes_schedule_192_smear");
554 &pshufd ("xmm1","xmm6",0x80); # d c 0 0 -> c 0 0 0
555 &pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
556 &pxor ("xmm6","xmm1"); # -> c+d c 0 0
557 &pxor ("xmm1","xmm1"); # xmm1 = 0, used as the zero source below
558 &pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
559 &movdqa ("xmm0","xmm6"); # result copy returned in xmm0
560 &movhlps("xmm6","xmm1"); # clobber low side with zeros
562 &function_end_B("_vpaes_schedule_192_smear");
565 ## .aes_schedule_round
567 ## Runs one main round of the key schedule on %xmm0, %xmm7
569 ## Specifically, runs subbytes on the high dword of %xmm0
570 ## then rotates it by one byte and xors into the low dword of
## %xmm7.
573 ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## next rcon.  ("%xmm8" is the stack slot at 8(%esp) in this port.)
576 ## Smears the dwords of %xmm7 by xoring the low into the
577 ## second low, result into third, result into highest.
579 ## Returns results in %xmm7 = %xmm0.
580 ## Clobbers %xmm1-%xmm5.
##
## The secondary entry point _vpaes_schedule_low_round skips the rcon and
## rotation steps and starts at the smear.
##
## NOTE(review): as visible here each "smear" step copies %xmm7 to %xmm1 and
## xors it straight back, which would zero %xmm7; the smear described above
## needs a shift between the copy and the xor.  Likewise no `&ret()` is
## visible before function_end_B.  Confirm both against the complete file.
582 &function_begin_B("_vpaes_schedule_round");
583 # extract rcon from xmm8
584 &movdqa ("xmm2",&QWP(8,"esp")); # xmm8
585 &pxor ("xmm1","xmm1");
586 &palignr("xmm1","xmm2",15); # 1 = top byte of rcon, moved to the low byte
587 &palignr("xmm2","xmm2",15); # rotate rcon by one byte for the next round
588 &pxor ("xmm7","xmm1"); # add rcon into xmm7
# rotate
591 &pshufd ("xmm0","xmm0",0xFF); # broadcast the top dword of xmm0
592 &palignr("xmm0","xmm0",1); # rotate by one byte
# save the rotated rcon for the next round
595 &movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
597 # low round: same as high round, but no rotation and no rcon.
598 &set_label("_vpaes_schedule_low_round");
# smear xmm7
600 &movdqa ("xmm1","xmm7");
602 &pxor ("xmm7","xmm1");
603 &movdqa ("xmm1","xmm7");
605 &pxor ("xmm7","xmm1");
606 &pxor ("xmm7",&QWP($k_s63,$const));
# subbytes
609 &movdqa ("xmm4",&QWP($k_s0F,$const)); # 4 : nibble mask
610 &movdqa ("xmm5",&QWP($k_inv,$const)); # 5 : inv table
611 &movdqa ("xmm1","xmm4");
612 &pandn ("xmm1","xmm0"); # 1 = i<<4
613 &psrld ("xmm1",4); # 1 = i
614 &pand ("xmm0","xmm4"); # 0 = k
615 &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
616 &pshufb ("xmm2","xmm0"); # 2 = a/k
617 &pxor ("xmm0","xmm1"); # 0 = j
618 &movdqa ("xmm3","xmm5"); # 3 : 1/i
619 &pshufb ("xmm3","xmm1"); # 3 = 1/i
620 &pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
621 &movdqa ("xmm4","xmm5"); # 4 : 1/j
622 &pshufb ("xmm4","xmm0"); # 4 = 1/j
623 &pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
624 &movdqa ("xmm2","xmm5"); # 2 : 1/iak
625 &pshufb ("xmm2","xmm3"); # 2 = 1/iak
626 &pxor ("xmm2","xmm0"); # 2 = io
627 &movdqa ("xmm3","xmm5"); # 3 : 1/jak
628 &pshufb ("xmm3","xmm4"); # 3 = 1/jak
629 &pxor ("xmm3","xmm1"); # 3 = jo
630 &movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
631 &pshufb ("xmm4","xmm2"); # 4 = sb1u
632 &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
633 &pshufb ("xmm0","xmm3"); # 0 = sb1t
634 &pxor ("xmm0","xmm4"); # 0 = sbox output
636 # add in smeared stuff
637 &pxor ("xmm0","xmm7");
638 &movdqa ("xmm7","xmm0"); # return the result in both xmm0 and xmm7
640 &function_end_B("_vpaes_schedule_round");
643 ## .aes_schedule_transform
645 ## Linear-transform %xmm0 according to tables at (%ebx)
##
## The caller points $base (%ebx) at a pair of 16-byte tables: the first is
## looked up with the low nibbles of each byte, the second with the high
## nibbles, and the two results are xored together.
##
## Output in %xmm0
648 ## Clobbers %xmm1, %xmm2
##
## NOTE(review): no `psrld` is visible between the pandn that isolates the
## high nibbles and the pshufb that uses them as indices, and no `&ret()` is
## visible before function_end_B.  Confirm both against the complete file.
650 &function_begin_B("_vpaes_schedule_transform");
651 &movdqa ("xmm2",&QWP($k_s0F,$const)); # 2 : nibble mask
652 &movdqa ("xmm1","xmm2");
653 &pandn ("xmm1","xmm0"); # 1 = high nibbles
655 &pand ("xmm0","xmm2"); # 0 = low nibbles
656 &movdqa ("xmm2",&QWP(0,$base)); # 2 : first table at ($base)
657 &pshufb ("xmm2","xmm0");
658 &movdqa ("xmm0",&QWP(16,$base)); # 0 : second table at 16($base)
659 &pshufb ("xmm0","xmm1");
660 &pxor ("xmm0","xmm2"); # combine both halves
662 &function_end_B("_vpaes_schedule_transform");
665 ## .aes_schedule_mangle
667 ## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
##    xor with 0x63
672 ## multiply by circulant 0,1,1,1
673 ## apply shiftrows transform
##
## On decrypt,
677 ## multiply by "inverse mixcolumns" circulant E,B,D,9
679 ## apply shiftrows transform
##
682 ## Writes out to (%edx), and increments or decrements it
683 ## Keeps track of round number mod 4 in %ecx
## Preserves xmm0
685 ## Clobbers xmm1-xmm5
## The decrypt path also overwrites $inp (%esi) with the address of the
## dksd table and walks the dksd/dksb/dkse/dks9 tables from there.
##
## NOTE(review): in this block the `jnz` that selects the decrypt path has
## no flag-setting instruction ahead of it, the updates of %edx and %ecx
## described above are not visible, and no `&ret()` precedes
## function_end_B.  Confirm each of these against the complete file.
687 &function_begin_B("_vpaes_schedule_mangle");
688 &movdqa ("xmm4","xmm0"); # save xmm0 for later
689 &movdqa ("xmm5",&QWP($k_mc_forward,$const)); # 5 : first mc_forward row
691 &jnz (&label("schedule_mangle_dec"));
# encrypting
695 &pxor ("xmm4",&QWP($k_s63,$const));
696 &pshufb ("xmm4","xmm5");
697 &movdqa ("xmm3","xmm4");
698 &pshufb ("xmm4","xmm5");
699 &pxor ("xmm3","xmm4");
700 &pshufb ("xmm4","xmm5");
701 &pxor ("xmm3","xmm4"); # 3 = xor of the three rotations
703 &jmp (&label("schedule_mangle_both"));
705 &set_label("schedule_mangle_dec",16);
706 # inverse mix columns
707 &movdqa ("xmm2",&QWP($k_s0F,$const)); # 2 : nibble mask
708 &lea ($inp,&DWP($k_dksd,$const)); # $inp -> dksd table
709 &movdqa ("xmm1","xmm2");
710 &pandn ("xmm1","xmm4");
711 &psrld ("xmm1",4); # 1 = hi
712 &pand ("xmm4","xmm2"); # 4 = lo
# dksd
714 &movdqa ("xmm2",&QWP(0,$inp));
715 &pshufb ("xmm2","xmm4");
716 &movdqa ("xmm3",&QWP(0x10,$inp));
717 &pshufb ("xmm3","xmm1");
718 &pxor ("xmm3","xmm2");
719 &pshufb ("xmm3","xmm5");
# dksb
721 &movdqa ("xmm2",&QWP(0x20,$inp));
722 &pshufb ("xmm2","xmm4");
723 &pxor ("xmm2","xmm3");
724 &movdqa ("xmm3",&QWP(0x30,$inp));
725 &pshufb ("xmm3","xmm1");
726 &pxor ("xmm3","xmm2");
727 &pshufb ("xmm3","xmm5");
# dkse
729 &movdqa ("xmm2",&QWP(0x40,$inp));
730 &pshufb ("xmm2","xmm4");
731 &pxor ("xmm2","xmm3");
732 &movdqa ("xmm3",&QWP(0x50,$inp));
733 &pshufb ("xmm3","xmm1");
734 &pxor ("xmm3","xmm2");
735 &pshufb ("xmm3","xmm5");
# dks9
737 &movdqa ("xmm2",&QWP(0x60,$inp));
738 &pshufb ("xmm2","xmm4");
739 &pxor ("xmm2","xmm3");
740 &movdqa ("xmm3",&QWP(0x70,$inp));
741 &pshufb ("xmm3","xmm1");
742 &pxor ("xmm3","xmm2");
746 &set_label("schedule_mangle_both");
747 &movdqa ("xmm1",&QWP($k_sr,$const,$magic)); # sr row selected by $magic
748 &pshufb ("xmm3","xmm1"); # apply shiftrows transform
751 &movdqu (&QWP(0,$key),"xmm3"); # write the mangled round key
753 &function_end_B("_vpaes_schedule_mangle");
756 # Interface to OpenSSL
#
# ${PREFIX}_set_encrypt_key(inp, bits, key): build the encryption key
# schedule for the user key at inp into key.
#
# A 56-byte scratch area is carved out below the current stack pointer and
# the previous %esp is saved at 48(%esp) so it can be restored before
# returning.  $const is preloaded with "_vpaes_consts+0x30 - pic_point";
# _vpaes_schedule_core turns that into an absolute address by adding its
# own return address, which is pic_point.
#
# NOTE(review): as visible here the value stored at 240($key) is whatever
# $base holds after the xchg (the previous %esp); the computation of
# nbits/32+5 named in the comment is not visible in this block.  Any
# stack-alignment step and the return value set-up are likewise not
# visible.  Confirm against the complete file.
758 &function_begin("${PREFIX}_set_encrypt_key");
759 &mov ($inp,&wparam(0)); # inp
760 &lea ($base,&DWP(-56,"esp")); # $base = address of new scratch frame
761 &mov ($round,&wparam(1)); # bits
763 &mov ($key,&wparam(2)); # key
764 &xchg ($base,"esp"); # alloca
765 &mov (&DWP(48,"esp"),$base); # remember the previous %esp
770 &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
774 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
775 &call ("_vpaes_schedule_core");
776 &set_label("pic_point");
778 &mov ("esp",&DWP(48,"esp")); # release the scratch frame
780 &function_end("${PREFIX}_set_encrypt_key");
# ${PREFIX}_set_decrypt_key(inp, bits, key): build the decryption key
# schedule for the user key at inp into key.
#
# Same frame handling and PIC set-up as ${PREFIX}_set_encrypt_key.  In
# addition $key is advanced by 16+$base before the schedule core runs, and
# $magic is derived from the key size for use as the sr-table index.
#
# NOTE(review): as visible here $base still holds the previous %esp when it
# is stored at 240($key) and used in the `lea` that advances $key, and
# $magic is the raw bit count xored with 32; the scaling/masking steps
# implied by the comments are not visible in this block.  Confirm against
# the complete file.
782 &function_begin("${PREFIX}_set_decrypt_key");
783 &mov ($inp,&wparam(0)); # inp
784 &lea ($base,&DWP(-56,"esp")); # $base = address of new scratch frame
785 &mov ($round,&wparam(1)); # bits
787 &mov ($key,&wparam(2)); # key
788 &xchg ($base,"esp"); # alloca
789 &mov (&DWP(48,"esp"),$base); # remember the previous %esp
794 &mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
796 &lea ($key,&DWP(16,$key,$base)); # $key += 16 + $base
799 &mov ($magic,$round);
802 &xor ($magic,32); # nbits==192?0:32;
804 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
805 &call ("_vpaes_schedule_core");
806 &set_label("pic_point");
808 &mov ("esp",&DWP(48,"esp")); # release the scratch frame
810 &function_end("${PREFIX}_set_decrypt_key");
# ${PREFIX}_encrypt(inp, out, key): encrypt the single 16-byte block at inp
# with the schedule in key and store the result at out.  Both block
# accesses use unaligned moves.
#
# _vpaes_preheat is called first with $const preloaded with
# "_vpaes_consts+0x30 - pic_point"; it adds its return address (pic_point)
# to form the absolute table address and loads %xmm6/%xmm7.  A 56-byte
# scratch frame is then set up with the previous %esp saved at 48(%esp).
#
# NOTE(review): any stack-alignment step between the `lea` and the `xchg`
# is not visible in this block -- confirm against the complete file.
812 &function_begin("${PREFIX}_encrypt");
813 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
814 &call ("_vpaes_preheat");
815 &set_label("pic_point");
816 &mov ($inp,&wparam(0)); # inp
817 &lea ($base,&DWP(-56,"esp")); # $base = address of new scratch frame
818 &mov ($out,&wparam(1)); # out
820 &mov ($key,&wparam(2)); # key
821 &xchg ($base,"esp"); # alloca
822 &mov (&DWP(48,"esp"),$base); # remember the previous %esp
824 &movdqu ("xmm0",&QWP(0,$inp)); # load input block
825 &call ("_vpaes_encrypt_core");
826 &movdqu (&QWP(0,$out),"xmm0"); # store output block
828 &mov ("esp",&DWP(48,"esp")); # release the scratch frame
829 &function_end("${PREFIX}_encrypt");
# ${PREFIX}_decrypt(inp, out, key): decrypt the single 16-byte block at inp
# with the schedule in key and store the result at out.  Both block
# accesses use unaligned moves.
#
# Structure mirrors ${PREFIX}_encrypt: _vpaes_preheat resolves the table
# address from "_vpaes_consts+0x30 - pic_point" plus its return address and
# loads %xmm6/%xmm7, then a 56-byte scratch frame is set up with the
# previous %esp saved at 48(%esp).
#
# NOTE(review): any stack-alignment step between the `lea` and the `xchg`
# is not visible in this block -- confirm against the complete file.
831 &function_begin("${PREFIX}_decrypt");
832 &lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
833 &call ("_vpaes_preheat");
834 &set_label("pic_point");
835 &mov ($inp,&wparam(0)); # inp
836 &lea ($base,&DWP(-56,"esp")); # $base = address of new scratch frame
837 &mov ($out,&wparam(1)); # out
839 &mov ($key,&wparam(2)); # key
840 &xchg ($base,"esp"); # alloca
841 &mov (&DWP(48,"esp"),$base); # remember the previous %esp
843 &movdqu ("xmm0",&QWP(0,$inp)); # load input block
844 &call ("_vpaes_decrypt_core");
845 &movdqu (&QWP(0,$out),"xmm0"); # store output block
847 &mov ("esp",&DWP(48,"esp")); # release the scratch frame
848 &function_end("${PREFIX}_decrypt");
# ${PREFIX}_cbc_encrypt(inp, out, len, key, ivp, enc): CBC-mode encryption
# or decryption of whole 16-byte blocks.
#
# Scratch frame layout (56 bytes, previous %esp saved at 48(%esp)):
#    0(%esp)  saved out
#    4(%esp)  saved key
#    8(%esp)  saved ivp
#   16(%esp)  current IV        (decrypt loop)
#   32(%esp)  next IV = current ciphertext block (decrypt loop)
#
# %xmm1 carries the chaining value between blocks and is written back to
# *ivp at cbc_done.  $out is reused as the remaining-length counter once
# its original value has been saved on the stack; the cores clobber $base
# and $key, so both are reloaded after every core call.
#
# NOTE(review): the conditional jumps below (jc/je/jnc) have no
# flag-setting instruction visible ahead of them in this block, and the
# stores to ($base,$inp) add $inp to the saved out pointer as-is.  The
# length tests, the enc==0 test, the out-minus-inp adjustment and any
# stack alignment must be confirmed against the complete file.
&function_begin("${PREFIX}_cbc_encrypt");
	&mov	($inp,&wparam(0));		# inp
	&mov	($out,&wparam(1));		# out
	&mov	($round,&wparam(2));		# len
	&mov	($key,&wparam(3));		# key
	&jc	(&label("cbc_abort"));
	&lea	($base,&DWP(-56,"esp"));	# $base = address of new scratch frame
	&mov	($const,&wparam(4));		# ivp
	&mov	($magic,&wparam(5));		# enc
	&xchg	($base,"esp");			# alloca
	&movdqu	("xmm1",&QWP(0,$const));	# load IV
	&mov	(&DWP(48,"esp"),$base);		# remember the previous %esp

	&mov	(&DWP(0,"esp"),$out);		# save out
	&mov	(&DWP(4,"esp"),$key);		# save key (terminated: was fused with the next call via '&')
	&mov	(&DWP(8,"esp"),$const);		# save ivp
	&mov	($out,$round);			# $out works as $len

	&lea	($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
	&call	("_vpaes_preheat");
&set_label("pic_point");
	&je	(&label("cbc_dec_loop"));
	&jmp	(&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
	&movdqu	("xmm0",&QWP(0,$inp));		# load input
	&pxor	("xmm0","xmm1");		# inp^=iv
	&call	("_vpaes_encrypt_core");
	&mov	($base,&DWP(0,"esp"));		# restore out
	&mov	($key,&DWP(4,"esp"));		# restore key
	&movdqa	("xmm1","xmm0");		# ciphertext becomes the next IV
	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
	&lea	($inp,&DWP(16,$inp));		# advance without touching flags
	&jnc	(&label("cbc_enc_loop"));
	&jmp	(&label("cbc_done"));

&set_label("cbc_dec_loop",16);
	&movdqu	("xmm0",&QWP(0,$inp));		# load input
	&movdqa	(&QWP(16,"esp"),"xmm1");	# save IV
	&movdqa	(&QWP(32,"esp"),"xmm0");	# save future IV
	&call	("_vpaes_decrypt_core");
	&mov	($base,&DWP(0,"esp"));		# restore out
	&mov	($key,&DWP(4,"esp"));		# restore key
	&pxor	("xmm0",&QWP(16,"esp"));	# out^=iv
	&movdqa	("xmm1",&QWP(32,"esp"));	# load next IV
	&movdqu	(&QWP(0,$base,$inp),"xmm0");	# write output
	&lea	($inp,&DWP(16,$inp));		# advance without touching flags
	&jnc	(&label("cbc_dec_loop"));

&set_label("cbc_done");
	&mov	($base,&DWP(8,"esp"));		# restore ivp
	&mov	("esp",&DWP(48,"esp"));		# release the scratch frame
	&movdqu	(&QWP(0,$base),"xmm1");		# write IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");
# Flush and close the generated output.  Buffered write errors only surface
# at close time, so abort with the OS error rather than leave a silently
# truncated assembly file behind.
914 close STDOUT or die "error closing STDOUT: $!";