2 # Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 ######################################################################
11 ## Constant-time SSSE3 AES core implementation.
14 ## By Mike Hamburg (Stanford University), 2009
17 ## For details see http://shiftleft.org/papers/vector_aes/ and
18 ## http://crypto.stanford.edu/vpaes/.
20 ######################################################################
23 # Interface to OpenSSL as an "almost" drop-in replacement for
24 # aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
25 # doesn't handle partial vectors (it doesn't have to if called from
26 # EVP only). "Drop-in" implies that this module doesn't share its key
27 # schedule structure with the original, nor does it make any assumption
28 # about its alignment...
30 # Performance summary. aes-x86_64.pl column lists large-block CBC
31 # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
32 # byte processed with 128-bit key, and vpaes-x86_64.pl column -
33 # [also large-block CBC] encrypt/decrypt.
35 # aes-x86_64.pl vpaes-x86_64.pl
37 # Core 2(**) 29.6/41.1/14.3 21.9/25.2(***)
38 # Nehalem 29.6/40.3/14.6 10.0/11.8
39 # Atom 57.3/74.2/32.1 60.9/77.2(***)
40 # Silvermont 52.7/64.0/19.5 48.8/60.8(***)
41 # Goldmont 38.9/49.0/17.8 10.6/12.6
43 # (*) "Hyper-threading" in this context refers to cache shared
44 # among multiple cores rather than specifically to Intel HTT. As the vast
45 # majority of contemporary cores share cache, the slower code path
46 # is commonplace. In other words, "with-hyper-threading-off"
47 # results are presented mostly for reference purposes.
49 # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
51 # (***) The less impressive improvement on Core 2 and Atom is due to slow
52 # pshufb, yet it's a respectable +36%/62% improvement on Core 2
53 # (as implied, over the "hyper-threading-safe" code path).
57 # $output is the last argument if it looks like a file (it has an extension)
58 # $flavour is the first argument if it doesn't look like a file
59 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
60 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Target Win64 when the flavour names nasm/masm/mingw64 or the output file ends in .asm.
62 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Take the directory part of this script's path, then look for the xlate
# translator beside the script and, failing that, in ../../perlasm.
64 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
65 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
66 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
67 die "can't locate x86_64-xlate.pl";
# Open a pipe to the translator, run under the current perl interpreter ($^X),
# passing it the flavour and the output file name.
69 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
70 or die "can't call $xlate: $!";
85 ## %xmm9-%xmm15 as in _vpaes_preheat
86 ## (%rdx) = scheduled keys
89 ## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
90 ## Preserves %xmm6 - %xmm8 so you get some local vectors
93 .type _vpaes_encrypt_core,\@abi-omnipotent
101 movdqa .Lk_ipt(%rip), %xmm2 # iptlo
103 movdqu (%r9), %xmm5 # round0 key
107 movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
112 lea .Lk_mc_backward(%rip),%r10 # r10 = .Lk_mc_backward; sbo, mc_forward and sr are addressed relative to it below
117 # middle of middle round
118 movdqa %xmm13, %xmm4 # 4 : sb1u
119 movdqa %xmm12, %xmm0 # 0 : sb1t
120 pshufb %xmm2, %xmm4 # 4 = sb1u
121 pshufb %xmm3, %xmm0 # 0 = sb1t
122 pxor %xmm5, %xmm4 # 4 = sb1u + k
123 movdqa %xmm15, %xmm5 # 5 : sb2u
124 pxor %xmm4, %xmm0 # 0 = A
125 movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
126 pshufb %xmm2, %xmm5 # 5 = sb2u
127 movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
128 movdqa %xmm14, %xmm2 # 2 : sb2t
129 pshufb %xmm3, %xmm2 # 2 = sb2t
130 movdqa %xmm0, %xmm3 # 3 = A
131 pxor %xmm5, %xmm2 # 2 = 2A
132 pshufb %xmm1, %xmm0 # 0 = B
133 add \$16, %r9 # next key
134 pxor %xmm2, %xmm0 # 0 = 2A+B
135 pshufb %xmm4, %xmm3 # 3 = D
136 add \$16, %r11 # next mc
137 pxor %xmm0, %xmm3 # 3 = 2A+B+D
138 pshufb %xmm1, %xmm0 # 0 = 2B+C
139 and \$0x30, %r11 # ... mod 4
141 pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
145 movdqa %xmm9, %xmm1 # 1 : i
146 movdqa %xmm11, %xmm5 # 5 : a/k
147 pandn %xmm0, %xmm1 # 1 = i<<4
148 psrld \$4, %xmm1 # 1 = i
149 pand %xmm9, %xmm0 # 0 = k
150 pshufb %xmm0, %xmm5 # 5 = a/k
151 movdqa %xmm10, %xmm3 # 3 : 1/i
152 pxor %xmm1, %xmm0 # 0 = j
153 pshufb %xmm1, %xmm3 # 3 = 1/i
154 movdqa %xmm10, %xmm4 # 4 : 1/j
155 pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
156 pshufb %xmm0, %xmm4 # 4 = 1/j
157 movdqa %xmm10, %xmm2 # 2 : 1/iak
158 pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
159 pshufb %xmm3, %xmm2 # 2 = 1/iak
160 movdqa %xmm10, %xmm3 # 3 : 1/jak
161 pxor %xmm0, %xmm2 # 2 = io
162 pshufb %xmm4, %xmm3 # 3 = 1/jak
164 pxor %xmm1, %xmm3 # 3 = jo
167 # middle of last round
168 movdqa -0x60(%r10), %xmm4 # 4 : sbou .Lk_sbo
169 movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
170 pshufb %xmm2, %xmm4 # 4 = sbou
171 pxor %xmm5, %xmm4 # 4 = sbou + k
172 pshufb %xmm3, %xmm0 # 0 = sbot
173 movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
174 pxor %xmm4, %xmm0 # 0 = A
178 .size _vpaes_encrypt_core,.-_vpaes_encrypt_core
183 ## Same API as encryption core.
185 .type _vpaes_decrypt_core,\@abi-omnipotent
189 mov %rdx, %r9 # r9 = key schedule pointer (copied from %rdx)
192 movdqa .Lk_dipt(%rip), %xmm2 # iptlo
196 movdqu (%r9), %xmm5 # round0 key
200 movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi
202 lea .Lk_dsbd(%rip),%r10 # r10 = .Lk_dsbd; the sb9/sbb/sbe/sbo tables are addressed relative to it below
206 movdqa .Lk_mc_forward+48(%rip), %xmm5 # 5 : last row of .Lk_mc_forward, used as the MC permutation
215 ## Inverse mix columns
217 movdqa -0x20(%r10),%xmm4 # 4 : sb9u
218 movdqa -0x10(%r10),%xmm1 # 1 : sb9t
219 pshufb %xmm2, %xmm4 # 4 = sb9u
220 pshufb %xmm3, %xmm1 # 1 = sb9t
222 movdqa 0x00(%r10),%xmm4 # 4 : sbdu
223 pxor %xmm1, %xmm0 # 0 = ch
224 movdqa 0x10(%r10),%xmm1 # 1 : sbdt
226 pshufb %xmm2, %xmm4 # 4 = sbdu
227 pshufb %xmm5, %xmm0 # MC ch
228 pshufb %xmm3, %xmm1 # 1 = sbdt
229 pxor %xmm4, %xmm0 # 0 = ch
230 movdqa 0x20(%r10),%xmm4 # 4 : sbbu
231 pxor %xmm1, %xmm0 # 0 = ch
232 movdqa 0x30(%r10),%xmm1 # 1 : sbbt
234 pshufb %xmm2, %xmm4 # 4 = sbbu
235 pshufb %xmm5, %xmm0 # MC ch
236 pshufb %xmm3, %xmm1 # 1 = sbbt
237 pxor %xmm4, %xmm0 # 0 = ch
238 movdqa 0x40(%r10),%xmm4 # 4 : sbeu
239 pxor %xmm1, %xmm0 # 0 = ch
240 movdqa 0x50(%r10),%xmm1 # 1 : sbet
242 pshufb %xmm2, %xmm4 # 4 = sbeu
243 pshufb %xmm5, %xmm0 # MC ch
244 pshufb %xmm3, %xmm1 # 1 = sbet
245 pxor %xmm4, %xmm0 # 0 = ch
246 add \$16, %r9 # next round key
247 palignr \$12, %xmm5, %xmm5 # byte-rotate the MC permutation held in %xmm5 by 12
248 pxor %xmm1, %xmm0 # 0 = ch
253 movdqa %xmm9, %xmm1 # 1 : i
254 pandn %xmm0, %xmm1 # 1 = i<<4
255 movdqa %xmm11, %xmm2 # 2 : a/k
256 psrld \$4, %xmm1 # 1 = i
257 pand %xmm9, %xmm0 # 0 = k
258 pshufb %xmm0, %xmm2 # 2 = a/k
259 movdqa %xmm10, %xmm3 # 3 : 1/i
260 pxor %xmm1, %xmm0 # 0 = j
261 pshufb %xmm1, %xmm3 # 3 = 1/i
262 movdqa %xmm10, %xmm4 # 4 : 1/j
263 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
264 pshufb %xmm0, %xmm4 # 4 = 1/j
265 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
266 movdqa %xmm10, %xmm2 # 2 : 1/iak
267 pshufb %xmm3, %xmm2 # 2 = 1/iak
268 movdqa %xmm10, %xmm3 # 3 : 1/jak
269 pxor %xmm0, %xmm2 # 2 = io
270 pshufb %xmm4, %xmm3 # 3 = 1/jak
272 pxor %xmm1, %xmm3 # 3 = jo
275 # middle of last round
276 movdqa 0x60(%r10), %xmm4 # 4 : sbou
277 pshufb %xmm2, %xmm4 # 4 = sbou
278 pxor %xmm0, %xmm4 # 4 = sbou + k
279 movdqa 0x70(%r10), %xmm0 # 0 : sbot
280 movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
281 pshufb %xmm3, %xmm0 # 0 = sbot
282 pxor %xmm4, %xmm0 # 0 = A
286 .size _vpaes_decrypt_core,.-_vpaes_decrypt_core
288 ########################################################
290 ## AES key schedule ##
292 ########################################################
293 .type _vpaes_schedule_core,\@abi-omnipotent
295 _vpaes_schedule_core:
300 # rcx = direction. 0=encrypt, 1=decrypt
302 call _vpaes_preheat # load the tables
303 movdqa .Lk_rcon(%rip), %xmm8 # load rcon
304 movdqu (%rdi), %xmm0 # load key (unaligned)
308 lea .Lk_ipt(%rip), %r11
309 call _vpaes_schedule_transform
312 lea .Lk_sr(%rip),%r10
314 jnz .Lschedule_am_decrypting
316 # encrypting, output zeroth round key after transform
320 .Lschedule_am_decrypting:
321 # decrypting, output zeroth round key after shiftrows
322 movdqa (%r8,%r10),%xmm1
336 ## 128-bit specific part of key schedule.
338 ## This schedule is really simple, because all its parts
339 ## are accomplished by the subroutines.
345 call _vpaes_schedule_round
347 jz .Lschedule_mangle_last
348 call _vpaes_schedule_mangle # write output
349 jmp .Loop_schedule_128
354 ## 192-bit specific part of key schedule.
356 ## The main body of this schedule is the same as the 128-bit
357 ## schedule, but with more smearing. The long, high side is
358 ## stored in %xmm7 as before, and the short, low side is in
359 ## the high bits of %xmm6.
361 ## This schedule is somewhat nastier, however, because each
362 ## round produces 192 bits of key material, or 1.5 round keys.
363 ## Therefore, on each cycle we do 2 rounds and produce 3 round
368 movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
369 call _vpaes_schedule_transform # input transform
370 movdqa %xmm0, %xmm6 # save short part
371 pxor %xmm4, %xmm4 # clear 4
372 movhlps %xmm4, %xmm6 # clobber low side with zeros
376 call _vpaes_schedule_round
377 palignr \$8,%xmm6,%xmm0
378 call _vpaes_schedule_mangle # save key n
379 call _vpaes_schedule_192_smear
380 call _vpaes_schedule_mangle # save key n+1
381 call _vpaes_schedule_round
383 jz .Lschedule_mangle_last
384 call _vpaes_schedule_mangle # save key n+2
385 call _vpaes_schedule_192_smear
386 jmp .Loop_schedule_192
391 ## 256-bit specific part of key schedule.
393 ## The structure here is very similar to the 128-bit
394 ## schedule, but with an additional "low side" in
395 ## %xmm6. The low side's rounds are the same as the
396 ## high side's, except no rcon and no rotation.
400 movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
401 call _vpaes_schedule_transform # input transform
405 call _vpaes_schedule_mangle # output low result
406 movdqa %xmm0, %xmm6 # save cur_lo in xmm6
409 call _vpaes_schedule_round
411 jz .Lschedule_mangle_last
412 call _vpaes_schedule_mangle
414 # low round. swap xmm7 and xmm6
415 pshufd \$0xFF, %xmm0, %xmm0
418 call _vpaes_schedule_low_round
421 jmp .Loop_schedule_256
425 ## .aes_schedule_mangle_last
427 ## Mangler for last round of key schedule
429 ## when encrypting, outputs out(%xmm0) ^ 63
430 ## when decrypting, outputs unskew(%xmm0)
432 ## Always called right before return... jumps to cleanup and exits
435 .Lschedule_mangle_last:
436 # schedule last round key from xmm0
437 lea .Lk_deskew(%rip),%r11 # prepare to deskew
439 jnz .Lschedule_mangle_last_dec
442 movdqa (%r8,%r10),%xmm1
443 pshufb %xmm1, %xmm0 # output permute
444 lea .Lk_opt(%rip), %r11 # prepare to output transform
447 .Lschedule_mangle_last_dec:
449 pxor .Lk_s63(%rip), %xmm0
450 call _vpaes_schedule_transform # output transform
451 movdqu %xmm0, (%rdx) # save last key
464 .size _vpaes_schedule_core,.-_vpaes_schedule_core
467 ## .aes_schedule_192_smear
469 ## Smear the short, low side in the 192-bit key schedule.
472 ## %xmm7: high side, b a x y
473 ## %xmm6: low side, d c 0 0
477 ## %xmm6: b+c+d b+c 0 0
478 ## %xmm0: b+c+d b+c b a
480 .type _vpaes_schedule_192_smear,\@abi-omnipotent
482 _vpaes_schedule_192_smear:
484 pshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
485 pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
486 pxor %xmm1, %xmm6 # -> c+d c 0 0
488 pxor %xmm0, %xmm6 # -> b+c+d b+c b a
490 movhlps %xmm1, %xmm6 # clobber low side with zeros
493 .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
496 ## .aes_schedule_round
498 ## Runs one main round of the key schedule on %xmm0, %xmm7
500 ## Specifically, runs subbytes on the high dword of %xmm0
501 ## then rotates it by one byte and xors into the low dword of
504 ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
507 ## Smears the dwords of %xmm7 by xoring the low into the
508 ## second low, result into third, result into highest.
510 ## Returns results in %xmm7 = %xmm0.
511 ## Clobbers %xmm1-%xmm4, %r11.
513 .type _vpaes_schedule_round,\@abi-omnipotent
515 _vpaes_schedule_round:
517 # extract rcon from xmm8
519 palignr \$15, %xmm8, %xmm1
520 palignr \$15, %xmm8, %xmm8 # byte-rotate %xmm8 in place
524 pshufd \$0xFF, %xmm0, %xmm0 # broadcast the high dword of %xmm0
525 palignr \$1, %xmm0, %xmm0 # rotate %xmm0 by one byte
529 # low round: same as high round, but no rotation and no rcon.
530 _vpaes_schedule_low_round:
538 pxor .Lk_s63(%rip), %xmm7
543 psrld \$4, %xmm1 # 1 = i
544 pand %xmm9, %xmm0 # 0 = k
545 movdqa %xmm11, %xmm2 # 2 : a/k
546 pshufb %xmm0, %xmm2 # 2 = a/k
547 pxor %xmm1, %xmm0 # 0 = j
548 movdqa %xmm10, %xmm3 # 3 : 1/i
549 pshufb %xmm1, %xmm3 # 3 = 1/i
550 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
551 movdqa %xmm10, %xmm4 # 4 : 1/j
552 pshufb %xmm0, %xmm4 # 4 = 1/j
553 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
554 movdqa %xmm10, %xmm2 # 2 : 1/iak
555 pshufb %xmm3, %xmm2 # 2 = 1/iak
556 pxor %xmm0, %xmm2 # 2 = io
557 movdqa %xmm10, %xmm3 # 3 : 1/jak
558 pshufb %xmm4, %xmm3 # 3 = 1/jak
559 pxor %xmm1, %xmm3 # 3 = jo
560 movdqa %xmm13, %xmm4 # 4 : sb1u (%xmm13 is loaded from .Lk_sb1 by _vpaes_preheat)
561 pshufb %xmm2, %xmm4 # 4 = sb1u
562 movdqa %xmm12, %xmm0 # 0 : sb1t (%xmm12 is loaded from .Lk_sb1+16 by _vpaes_preheat)
563 pshufb %xmm3, %xmm0 # 0 = sb1t
564 pxor %xmm4, %xmm0 # 0 = sbox output
566 # add in smeared stuff
571 .size _vpaes_schedule_round,.-_vpaes_schedule_round
574 ## .aes_schedule_transform
576 ## Linear-transform %xmm0 according to tables at (%r11)
578 ## Requires that %xmm9 = 0x0F0F... as in preheat
580 ## Clobbers %xmm1, %xmm2
582 .type _vpaes_schedule_transform,\@abi-omnipotent
584 _vpaes_schedule_transform:
590 movdqa (%r11), %xmm2 # lo
592 movdqa 16(%r11), %xmm0 # hi
597 .size _vpaes_schedule_transform,.-_vpaes_schedule_transform
600 ## .aes_schedule_mangle
602 ## Mangle xmm0 from (basis-transformed) standard version
607 ## multiply by circulant 0,1,1,1
608 ## apply shiftrows transform
612 ## multiply by "inverse mixcolumns" circulant E,B,D,9
614 ## apply shiftrows transform
617 ## Writes out to (%rdx), and increments or decrements it
618 ## Keeps track of round number mod 4 in %r8
620 ## Clobbers xmm1-xmm5
622 .type _vpaes_schedule_mangle,\@abi-omnipotent
624 _vpaes_schedule_mangle:
626 movdqa %xmm0, %xmm4 # save xmm0 for later
627 movdqa .Lk_mc_forward(%rip),%xmm5
629 jnz .Lschedule_mangle_dec
633 pxor .Lk_s63(%rip),%xmm4
641 jmp .Lschedule_mangle_both
643 .Lschedule_mangle_dec:
644 # inverse mix columns
645 lea .Lk_dksd(%rip),%r11
648 psrld \$4, %xmm1 # 1 = hi
649 pand %xmm9, %xmm4 # 4 = lo
651 movdqa 0x00(%r11), %xmm2
653 movdqa 0x10(%r11), %xmm3
658 movdqa 0x20(%r11), %xmm2
661 movdqa 0x30(%r11), %xmm3
666 movdqa 0x40(%r11), %xmm2
669 movdqa 0x50(%r11), %xmm3
674 movdqa 0x60(%r11), %xmm2
677 movdqa 0x70(%r11), %xmm3
683 .Lschedule_mangle_both:
684 movdqa (%r8,%r10),%xmm1
691 .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
694 # Interface to OpenSSL
696 .globl ${PREFIX}_set_encrypt_key
697 .type ${PREFIX}_set_encrypt_key,\@function,3
699 ${PREFIX}_set_encrypt_key:
703 $code.=<<___ if ($win64);
705 movaps %xmm6,0x10(%rsp)
706 movaps %xmm7,0x20(%rsp)
707 movaps %xmm8,0x30(%rsp)
708 movaps %xmm9,0x40(%rsp)
709 movaps %xmm10,0x50(%rsp)
710 movaps %xmm11,0x60(%rsp)
711 movaps %xmm12,0x70(%rsp)
712 movaps %xmm13,0x80(%rsp)
713 movaps %xmm14,0x90(%rsp)
714 movaps %xmm15,0xa0(%rsp)
721 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
725 call _vpaes_schedule_core
727 $code.=<<___ if ($win64);
728 movaps 0x10(%rsp),%xmm6
729 movaps 0x20(%rsp),%xmm7
730 movaps 0x30(%rsp),%xmm8
731 movaps 0x40(%rsp),%xmm9
732 movaps 0x50(%rsp),%xmm10
733 movaps 0x60(%rsp),%xmm11
734 movaps 0x70(%rsp),%xmm12
735 movaps 0x80(%rsp),%xmm13
736 movaps 0x90(%rsp),%xmm14
737 movaps 0xa0(%rsp),%xmm15
745 .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
747 .globl ${PREFIX}_set_decrypt_key
748 .type ${PREFIX}_set_decrypt_key,\@function,3
750 ${PREFIX}_set_decrypt_key:
754 $code.=<<___ if ($win64);
756 movaps %xmm6,0x10(%rsp)
757 movaps %xmm7,0x20(%rsp)
758 movaps %xmm8,0x30(%rsp)
759 movaps %xmm9,0x40(%rsp)
760 movaps %xmm10,0x50(%rsp)
761 movaps %xmm11,0x60(%rsp)
762 movaps %xmm12,0x70(%rsp)
763 movaps %xmm13,0x80(%rsp)
764 movaps %xmm14,0x90(%rsp)
765 movaps %xmm15,0xa0(%rsp)
772 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
774 lea 16(%rdx,%rax),%rdx
780 xor \$32,%r8d # nbits==192?0:32
781 call _vpaes_schedule_core
783 $code.=<<___ if ($win64);
784 movaps 0x10(%rsp),%xmm6
785 movaps 0x20(%rsp),%xmm7
786 movaps 0x30(%rsp),%xmm8
787 movaps 0x40(%rsp),%xmm9
788 movaps 0x50(%rsp),%xmm10
789 movaps 0x60(%rsp),%xmm11
790 movaps 0x70(%rsp),%xmm12
791 movaps 0x80(%rsp),%xmm13
792 movaps 0x90(%rsp),%xmm14
793 movaps 0xa0(%rsp),%xmm15
801 .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
803 .globl ${PREFIX}_encrypt
804 .type ${PREFIX}_encrypt,\@function,3
810 $code.=<<___ if ($win64);
812 movaps %xmm6,0x10(%rsp)
813 movaps %xmm7,0x20(%rsp)
814 movaps %xmm8,0x30(%rsp)
815 movaps %xmm9,0x40(%rsp)
816 movaps %xmm10,0x50(%rsp)
817 movaps %xmm11,0x60(%rsp)
818 movaps %xmm12,0x70(%rsp)
819 movaps %xmm13,0x80(%rsp)
820 movaps %xmm14,0x90(%rsp)
821 movaps %xmm15,0xa0(%rsp)
827 call _vpaes_encrypt_core
830 $code.=<<___ if ($win64);
831 movaps 0x10(%rsp),%xmm6
832 movaps 0x20(%rsp),%xmm7
833 movaps 0x30(%rsp),%xmm8
834 movaps 0x40(%rsp),%xmm9
835 movaps 0x50(%rsp),%xmm10
836 movaps 0x60(%rsp),%xmm11
837 movaps 0x70(%rsp),%xmm12
838 movaps 0x80(%rsp),%xmm13
839 movaps 0x90(%rsp),%xmm14
840 movaps 0xa0(%rsp),%xmm15
847 .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
849 .globl ${PREFIX}_decrypt
850 .type ${PREFIX}_decrypt,\@function,3
856 $code.=<<___ if ($win64);
858 movaps %xmm6,0x10(%rsp)
859 movaps %xmm7,0x20(%rsp)
860 movaps %xmm8,0x30(%rsp)
861 movaps %xmm9,0x40(%rsp)
862 movaps %xmm10,0x50(%rsp)
863 movaps %xmm11,0x60(%rsp)
864 movaps %xmm12,0x70(%rsp)
865 movaps %xmm13,0x80(%rsp)
866 movaps %xmm14,0x90(%rsp)
867 movaps %xmm15,0xa0(%rsp)
873 call _vpaes_decrypt_core
876 $code.=<<___ if ($win64);
877 movaps 0x10(%rsp),%xmm6
878 movaps 0x20(%rsp),%xmm7
879 movaps 0x30(%rsp),%xmm8
880 movaps 0x40(%rsp),%xmm9
881 movaps 0x50(%rsp),%xmm10
882 movaps 0x60(%rsp),%xmm11
883 movaps 0x70(%rsp),%xmm12
884 movaps 0x80(%rsp),%xmm13
885 movaps 0x90(%rsp),%xmm14
886 movaps 0xa0(%rsp),%xmm15
893 .size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
896 my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
897 # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
898 # size_t length, const AES_KEY *key,
899 # unsigned char *ivp,const int enc);
901 .globl ${PREFIX}_cbc_encrypt
902 .type ${PREFIX}_cbc_encrypt,\@function,6
904 ${PREFIX}_cbc_encrypt:
909 ($len,$key)=($key,$len);
914 $code.=<<___ if ($win64);
916 movaps %xmm6,0x10(%rsp)
917 movaps %xmm7,0x20(%rsp)
918 movaps %xmm8,0x30(%rsp)
919 movaps %xmm9,0x40(%rsp)
920 movaps %xmm10,0x50(%rsp)
921 movaps %xmm11,0x60(%rsp)
922 movaps %xmm12,0x70(%rsp)
923 movaps %xmm13,0x80(%rsp)
924 movaps %xmm14,0x90(%rsp)
925 movaps %xmm15,0xa0(%rsp)
929 movdqu ($ivp),%xmm6 # load IV
939 call _vpaes_encrypt_core
941 movdqu %xmm0,($out,$inp)
950 call _vpaes_decrypt_core
953 movdqu %xmm0,($out,$inp)
958 movdqu %xmm6,($ivp) # save IV
960 $code.=<<___ if ($win64);
961 movaps 0x10(%rsp),%xmm6
962 movaps 0x20(%rsp),%xmm7
963 movaps 0x30(%rsp),%xmm8
964 movaps 0x40(%rsp),%xmm9
965 movaps 0x50(%rsp),%xmm10
966 movaps 0x60(%rsp),%xmm11
967 movaps 0x70(%rsp),%xmm12
968 movaps 0x80(%rsp),%xmm13
969 movaps 0x90(%rsp),%xmm14
970 movaps 0xa0(%rsp),%xmm15
978 .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
985 ## Fills register %r10 -> .Lk_s0F in the constants table (so you can -fPIC)
986 ## and %xmm9-%xmm15 as specified below.
988 .type _vpaes_preheat,\@abi-omnipotent
992 lea .Lk_s0F(%rip), %r10 # r10 = address of .Lk_s0F; the loads below are relative to it
993 movdqa -0x20(%r10), %xmm10 # .Lk_inv
994 movdqa -0x10(%r10), %xmm11 # .Lk_inv+16
995 movdqa 0x00(%r10), %xmm9 # .Lk_s0F
996 movdqa 0x30(%r10), %xmm13 # .Lk_sb1
997 movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16
998 movdqa 0x50(%r10), %xmm15 # .Lk_sb2
999 movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16
1002 .size _vpaes_preheat,.-_vpaes_preheat
1003 ########################################################
1007 ########################################################
1008 .type _vpaes_consts,\@object
1011 .Lk_inv: # inv, inva
1012 .quad 0x0E05060F0D080180, 0x040703090A0B0C02
1013 .quad 0x01040A060F0B0780, 0x030D0E0C02050809
1016 .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
1018 .Lk_ipt: # input transform (lo, hi)
1019 .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
1020 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
1022 .Lk_sb1: # sb1u, sb1t
1023 .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
1024 .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
1025 .Lk_sb2: # sb2u, sb2t
1026 .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
1027 .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
1028 .Lk_sbo: # sbou, sbot
1029 .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
1030 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
1032 .Lk_mc_forward: # mc_forward
1033 .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
1034 .quad 0x080B0A0904070605, 0x000302010C0F0E0D
1035 .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
1036 .quad 0x000302010C0F0E0D, 0x080B0A0904070605
1038 .Lk_mc_backward:# mc_backward
1039 .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
1040 .quad 0x020100030E0D0C0F, 0x0A09080B06050407
1041 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
1042 .quad 0x0A09080B06050407, 0x020100030E0D0C0F
1045 .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
1046 .quad 0x030E09040F0A0500, 0x0B06010C07020D08
1047 .quad 0x0F060D040B020900, 0x070E050C030A0108
1048 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
1051 .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
1053 .Lk_s63: # s63: all equal to 0x63 transformed
1054 .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
1056 .Lk_opt: # output transform
1057 .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
1058 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
1060 .Lk_deskew: # deskew tables: inverts the sbox's "skew"
1061 .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
1062 .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
1066 ## Key schedule constants
1068 .Lk_dksd: # decryption key schedule: invskew x*D
1069 .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
1070 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
1071 .Lk_dksb: # decryption key schedule: invskew x*B
1072 .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
1073 .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
1074 .Lk_dkse: # decryption key schedule: invskew x*E + 0x63
1075 .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
1076 .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
1077 .Lk_dks9: # decryption key schedule: invskew x*9
1078 .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
1079 .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
1083 ## Round function constants
1085 .Lk_dipt: # decryption input transform
1086 .quad 0x0F505B040B545F00, 0x154A411E114E451A
1087 .quad 0x86E383E660056500, 0x12771772F491F194
1089 .Lk_dsb9: # decryption sbox output *9*u, *9*t
1090 .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
1091 .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
1092 .Lk_dsbd: # decryption sbox output *D*u, *D*t
1093 .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
1094 .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
1095 .Lk_dsbb: # decryption sbox output *B*u, *B*t
1096 .quad 0xD022649296B44200, 0x602646F6B0F2D404
1097 .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
1098 .Lk_dsbe: # decryption sbox output *E*u, *E*t
1099 .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
1100 .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
1101 .Lk_dsbo: # decryption sbox final output
1102 .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
1103 .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
1104 .asciz "Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
1106 .size _vpaes_consts,.-_vpaes_consts
1110 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1111 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1118 .extern __imp_RtlVirtualUnwind
1119 .type se_handler,\@abi-omnipotent
1133 mov 120($context),%rax # pull context->Rax
1134 mov 248($context),%rbx # pull context->Rip
1136 mov 8($disp),%rsi # disp->ImageBase
1137 mov 56($disp),%r11 # disp->HandlerData
1139 mov 0(%r11),%r10d # HandlerData[0]
1140 lea (%rsi,%r10),%r10 # prologue label (ImageBase + HandlerData[0])
1141 cmp %r10,%rbx # context->Rip<prologue label
1144 mov 152($context),%rax # pull context->Rsp
1146 mov 4(%r11),%r10d # HandlerData[1]
1147 lea (%rsi,%r10),%r10 # epilogue label (ImageBase + HandlerData[1])
1148 cmp %r10,%rbx # context->Rip>=epilogue label
1151 lea 16(%rax),%rsi # %xmm save area (the prologues store %xmm6 at 0x10(%rsp))
1152 lea 512($context),%rdi # &context.Xmm6
1153 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
1154 .long 0xa548f3fc # cld; rep movsq
1155 lea 0xb8(%rax),%rax # adjust stack pointer
1160 mov %rax,152($context) # restore context->Rsp
1161 mov %rsi,168($context) # restore context->Rsi
1162 mov %rdi,176($context) # restore context->Rdi
1164 mov 40($disp),%rdi # disp->ContextRecord
1165 mov $context,%rsi # context
1166 mov \$`1232/8`,%ecx # sizeof(CONTEXT)/8: qword count for rep movsq
1167 .long 0xa548f3fc # cld; rep movsq
1170 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1171 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1172 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1173 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1174 mov 40(%rsi),%r10 # disp->ContextRecord
1175 lea 56(%rsi),%r11 # &disp->HandlerData
1176 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1177 mov %r10,32(%rsp) # arg5
1178 mov %r11,40(%rsp) # arg6
1179 mov %r12,48(%rsp) # arg7
1180 mov %rcx,56(%rsp) # arg8, (NULL)
1181 call *__imp_RtlVirtualUnwind(%rip)
1183 mov \$1,%eax # ExceptionContinueSearch
1195 .size se_handler,.-se_handler
1199 .rva .LSEH_begin_${PREFIX}_set_encrypt_key
1200 .rva .LSEH_end_${PREFIX}_set_encrypt_key
1201 .rva .LSEH_info_${PREFIX}_set_encrypt_key
1203 .rva .LSEH_begin_${PREFIX}_set_decrypt_key
1204 .rva .LSEH_end_${PREFIX}_set_decrypt_key
1205 .rva .LSEH_info_${PREFIX}_set_decrypt_key
1207 .rva .LSEH_begin_${PREFIX}_encrypt
1208 .rva .LSEH_end_${PREFIX}_encrypt
1209 .rva .LSEH_info_${PREFIX}_encrypt
1211 .rva .LSEH_begin_${PREFIX}_decrypt
1212 .rva .LSEH_end_${PREFIX}_decrypt
1213 .rva .LSEH_info_${PREFIX}_decrypt
1215 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
1216 .rva .LSEH_end_${PREFIX}_cbc_encrypt
1217 .rva .LSEH_info_${PREFIX}_cbc_encrypt
1221 .LSEH_info_${PREFIX}_set_encrypt_key:
1224 .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[]
1225 .LSEH_info_${PREFIX}_set_decrypt_key:
1228 .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[]
1229 .LSEH_info_${PREFIX}_encrypt:
1232 .rva .Lenc_body,.Lenc_epilogue # HandlerData[]
1233 .LSEH_info_${PREFIX}_decrypt:
1236 .rva .Ldec_body,.Ldec_epilogue # HandlerData[]
1237 .LSEH_info_${PREFIX}_cbc_encrypt:
1240 .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[]
1244 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
1248 close STDOUT or die "error closing STDOUT: $!";