2 # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 ######################################################################
11 ## Constant-time SSSE3 AES core implementation.
14 ## By Mike Hamburg (Stanford University), 2009
17 ## For details see http://shiftleft.org/papers/vector_aes/ and
18 ## http://crypto.stanford.edu/vpaes/.
20 ######################################################################
23 # Interface to OpenSSL as "almost" drop-in replacement for
24 # aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
25 # doesn't handle partial vectors (doesn't have to if called from
26 # EVP only). "Drop-in" implies that this module doesn't share key
27 # schedule structure with the original nor does it make assumption
28 # about its alignment...
30 # Performance summary. aes-x86_64.pl column lists large-block CBC
31 # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
32 # byte processed with 128-bit key, and vpaes-x86_64.pl column -
33 # [also large-block CBC] encrypt/decrypt.
35 # aes-x86_64.pl vpaes-x86_64.pl
37 # Core 2(**) 29.6/41.1/14.3 21.9/25.2(***)
38 # Nehalem 29.6/40.3/14.6 10.0/11.8
39 # Atom 57.3/74.2/32.1 60.9/77.2(***)
40 # Silvermont 52.7/64.0/19.5 48.8/60.8(***)
41 # Goldmont 38.9/49.0/17.8 10.6/12.6
43 # (*) "Hyper-threading" in the context refers rather to cache shared
44 # among multiple cores, than to specifically Intel HTT. As vast
45 # majority of contemporary cores share cache, slower code path
46 # is commonplace. In other words "with-hyper-threading-off"
47 # results are presented mostly for reference purposes.
49 # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
51 # (***) Less impressive improvement on Core 2 and Atom is due to slow
52 # pshufb, yet it's respectable +36%/62% improvement on Core 2
53 # (as implied, over "hyper-threading-safe" code path).
# Perlasm driver setup: detect output flavour (ELF/macho/nasm/masm/mingw64),
# locate the x86_64-xlate.pl translator, and pipe all generated code through it.
59 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 calling convention is selected when targeting [nm]asm/mingw64 or a .asm output.
61 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Derive this script's directory so the translator can be found relative to it.
63 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
64 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
65 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
66 die "can't locate x86_64-xlate.pl";
# All emitted code is written through the translator; $^X is the running perl.
68 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
##
##  _vpaes_encrypt_core: encrypt one AES block using SSSE3 pshufb-based
##  constant-time S-box lookups (no data-dependent table indices).
##  Inputs per the header below; round keys are read from (%rdx) via %r9.
##
83 ##  %xmm9-%xmm15 as in _vpaes_preheat
84 ##  (%rdx) = scheduled keys
87 ##  Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
88 ##  Preserves %xmm6 - %xmm8 so you get some local vectors
91 .type	_vpaes_encrypt_core,\@abi-omnipotent
98 	movdqa	.Lk_ipt(%rip), %xmm2	# iptlo
100 	movdqu	(%r9), %xmm5		# round0 key
104 	movdqa	.Lk_ipt+16(%rip), %xmm0	# ipthi
# %r10 anchors the mix-columns tables; %r11 cycles the round index mod 4 (see "and $0x30" below)
109 	lea	.Lk_mc_backward(%rip),%r10
114 	# middle of middle round
115 	movdqa  %xmm13,	%xmm4	# 4 : sb1u
116 	movdqa  %xmm12,	%xmm0	# 0 : sb1t
117 	pshufb  %xmm2,	%xmm4	# 4 = sb1u
118 	pshufb  %xmm3,	%xmm0	# 0 = sb1t
119 	pxor	%xmm5,	%xmm4	# 4 = sb1u + k
120 	movdqa  %xmm15,	%xmm5	# 4 : sb2u
121 	pxor	%xmm4,	%xmm0	# 0 = A
122 	movdqa	-0x40(%r11,%r10), %xmm1		# .Lk_mc_forward[]
123 	pshufb	%xmm2,	%xmm5	# 4 = sb2u
124 	movdqa	(%r11,%r10), %xmm4		# .Lk_mc_backward[]
125 	movdqa	%xmm14, %xmm2	# 2 : sb2t
126 	pshufb	%xmm3,  %xmm2	# 2 = sb2t
127 	movdqa	%xmm0,  %xmm3	# 3 = A
128 	pxor	%xmm5,	%xmm2	# 2 = 2A
129 	pshufb  %xmm1,  %xmm0	# 0 = B
130 	add	\$16,	%r9	# next key
131 	pxor	%xmm2,  %xmm0	# 0 = 2A+B
132 	pshufb	%xmm4,	%xmm3	# 3 = D
133 	add	\$16,	%r11	# next mc
134 	pxor	%xmm0,	%xmm3	# 3 = 2A+B+D
135 	pshufb  %xmm1,	%xmm0	# 0 = 2B+C
# keep %r11 in {0x00,0x10,0x20,0x30} so the mc_forward/backward table index wraps
136 	and	\$0x30,	%r11	# ... mod 4
138 	pxor	%xmm3,	%xmm0	# 0 = 2A+3B+C+D
# S-box substitution via the split-nibble inverse tables (%xmm9 = 0x0F mask,
# %xmm10/%xmm11 = .Lk_inv halves loaded by _vpaes_preheat)
142 	movdqa  %xmm9, 	%xmm1	# 1 : i
143 	movdqa	%xmm11, %xmm5	# 2 : a/k
144 	pandn	%xmm0, 	%xmm1	# 1 = i<<4
145 	psrld	\$4,   	%xmm1   # 1 = i
146 	pand	%xmm9, 	%xmm0   # 0 = k
147 	pshufb  %xmm0,  %xmm5	# 2 = a/k
148 	movdqa	%xmm10,	%xmm3  	# 3 : 1/i
149 	pxor	%xmm1,	%xmm0	# 0 = j
150 	pshufb  %xmm1, 	%xmm3  	# 3 = 1/i
151 	movdqa	%xmm10,	%xmm4	# 4 : 1/j
152 	pxor	%xmm5,	%xmm3	# 3 = iak = 1/i + a/k
153 	pshufb	%xmm0, 	%xmm4  	# 4 = 1/j
154 	movdqa	%xmm10,	%xmm2  	# 2 : 1/iak
155 	pxor	%xmm5,	%xmm4  	# 4 = jak = 1/j + a/k
156 	pshufb  %xmm3,	%xmm2  	# 2 = 1/iak
157 	movdqa	%xmm10, %xmm3	# 3 : 1/jak
158 	pxor	%xmm0, 	%xmm2  	# 2 = io
159 	pshufb  %xmm4,  %xmm3	# 3 = 1/jak
161 	pxor	%xmm1,  %xmm3	# 3 = jo
164 	# middle of last round
165 	movdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
166 	movdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
167 	pshufb  %xmm2,  %xmm4	# 4 = sbou
168 	pxor	%xmm5,  %xmm4	# 4 = sb1u + k
169 	pshufb  %xmm3,	%xmm0	# 0 = sb1t
# final shiftrows permutation selected by round index in %r11
170 	movdqa	0x40(%r11,%r10), %xmm1		# .Lk_sr[]
171 	pxor	%xmm4,	%xmm0	# 0 = A
174 .size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
##
##  _vpaes_decrypt_core: decrypt one AES block; mirror of the encryption
##  core using the decryption tables (.Lk_dipt, .Lk_dsb*, .Lk_dsbo).
##
179 ##  Same API as encryption core.
181 .type	_vpaes_decrypt_core,\@abi-omnipotent
184 	mov	%rdx,	%r9		# load key
187 	movdqa	.Lk_dipt(%rip), %xmm2	# iptlo
191 	movdqu	(%r9),	%xmm5		# round0 key
195 	movdqa	.Lk_dipt+16(%rip), %xmm0 # ipthi
197 	lea	.Lk_dsbd(%rip),%r10
# %xmm5 holds the mix-columns rotation constant; it is rotated by 4 bytes
# each round via the palignr near the bottom of the loop
201 	movdqa	.Lk_mc_forward+48(%rip), %xmm5
210 ##  Inverse mix columns
# Accumulate the four inverse-mixcolumns contributions (x*9, x*D, x*B, x*E),
# interleaving pshufb-by-%xmm5 rotations of the running value %xmm0.
212 	movdqa  -0x20(%r10),%xmm4	# 4 : sb9u
213 	movdqa  -0x10(%r10),%xmm1	# 0 : sb9t
214 	pshufb	%xmm2,	%xmm4		# 4 = sb9u
215 	pshufb	%xmm3,	%xmm1		# 0 = sb9t
217 	movdqa  0x00(%r10),%xmm4	# 4 : sbdu
218 	pxor	%xmm1,	%xmm0		# 0 = ch
219 	movdqa  0x10(%r10),%xmm1	# 0 : sbdt
221 	pshufb	%xmm2,	%xmm4		# 4 = sbdu
222 	pshufb	%xmm5,	%xmm0		# MC ch
223 	pshufb	%xmm3,	%xmm1		# 0 = sbdt
224 	pxor	%xmm4,	%xmm0		# 4 = ch
225 	movdqa  0x20(%r10),%xmm4	# 4 : sbbu
226 	pxor	%xmm1,	%xmm0		# 0 = ch
227 	movdqa  0x30(%r10),%xmm1	# 0 : sbbt
229 	pshufb	%xmm2,	%xmm4		# 4 = sbbu
230 	pshufb	%xmm5,	%xmm0		# MC ch
231 	pshufb	%xmm3,	%xmm1		# 0 = sbbt
232 	pxor	%xmm4,	%xmm0		# 4 = ch
233 	movdqa  0x40(%r10),%xmm4	# 4 : sbeu
234 	pxor	%xmm1,	%xmm0		# 0 = ch
235 	movdqa  0x50(%r10),%xmm1	# 0 : sbet
237 	pshufb	%xmm2,	%xmm4		# 4 = sbeu
238 	pshufb	%xmm5,	%xmm0		# MC ch
239 	pshufb	%xmm3,	%xmm1		# 0 = sbet
240 	pxor	%xmm4,	%xmm0		# 4 = ch
241 	add	\$16, %r9		# next round key
# rotate the MC constant by 4 bytes for the next round
242 	palignr	\$12,	%xmm5,	%xmm5
243 	pxor	%xmm1,	%xmm0		# 0 = ch
# Inverse S-box substitution via the split-nibble inverse tables
248 	movdqa  %xmm9, 	%xmm1	# 1 : i
249 	pandn	%xmm0, 	%xmm1	# 1 = i<<4
250 	movdqa	%xmm11, %xmm2	# 2 : a/k
251 	psrld	\$4,    %xmm1	# 1 = i
252 	pand	%xmm9, 	%xmm0	# 0 = k
253 	pshufb  %xmm0,  %xmm2	# 2 = a/k
254 	movdqa	%xmm10,	%xmm3	# 3 : 1/i
255 	pxor	%xmm1,	%xmm0	# 0 = j
256 	pshufb  %xmm1, 	%xmm3	# 3 = 1/i
257 	movdqa	%xmm10,	%xmm4	# 4 : 1/j
258 	pxor	%xmm2, 	%xmm3	# 3 = iak = 1/i + a/k
259 	pshufb	%xmm0, 	%xmm4	# 4 = 1/j
260 	pxor	%xmm2, 	%xmm4	# 4 = jak = 1/j + a/k
261 	movdqa	%xmm10,	%xmm2	# 2 : 1/iak
262 	pshufb  %xmm3,	%xmm2	# 2 = 1/iak
263 	movdqa	%xmm10, %xmm3	# 3 : 1/jak
264 	pxor	%xmm0, 	%xmm2	# 2 = io
265 	pshufb  %xmm4,  %xmm3	# 3 = 1/jak
267 	pxor	%xmm1,  %xmm3	# 3 = jo
270 	# middle of last round
271 	movdqa	0x60(%r10), %xmm4	# 3 : sbou
272 	pshufb  %xmm2,  %xmm4	# 4 = sbou
273 	pxor	%xmm0,  %xmm4	# 4 = sb1u + k
274 	movdqa	0x70(%r10), %xmm0	# 0 : sbot
# .Lk_sr is addressed relative to .Lk_dsbd (in %r10's neighborhood) via %r11
275 	movdqa	-0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
276 	pshufb  %xmm3,	%xmm0	# 0 = sb1t
277 	pxor	%xmm4,	%xmm0	# 0 = A
280 .size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
282 ########################################################
284 ##                    AES key schedule                ##
286 ########################################################
##
##  _vpaes_schedule_core: expand a 128/192/256-bit user key at (%rdi)
##  into the vpaes round-key schedule at (%rdx).  %rcx selects
##  encrypt (0) vs decrypt (1) layout; dispatches to per-size loops.
##
287 .type	_vpaes_schedule_core,\@abi-omnipotent
289 _vpaes_schedule_core:
293 	# rcx = direction. 0=encrypt, 1=decrypt
295 	call	_vpaes_preheat		# load the tables
296 	movdqa	.Lk_rcon(%rip), %xmm8	# load rcon
297 	movdqu	(%rdi),	%xmm0		# load key (unaligned)
# basis-transform the raw key into vpaes representation
301 	lea	.Lk_ipt(%rip), %r11
302 	call	_vpaes_schedule_transform
305 	lea	.Lk_sr(%rip),%r10
307 	jnz	.Lschedule_am_decrypting
309 	# encrypting, output zeroth round key after transform
313 .Lschedule_am_decrypting:
314 	# decrypting, output zeroth round key after shiftrows
# %r8 indexes into .Lk_sr to pick the shiftrows permutation
315 	movdqa	(%r8,%r10),%xmm1
329 ##  128-bit specific part of key schedule.
331 ##  This schedule is really simple, because all its parts
332 ##  are accomplished by the subroutines.
338 	call	_vpaes_schedule_round
340 	jz 	.Lschedule_mangle_last
341 	call	_vpaes_schedule_mangle	# write output
342 	jmp 	.Loop_schedule_128
347 ##  192-bit specific part of key schedule.
349 ##  The main body of this schedule is the same as the 128-bit
350 ##  schedule, but with more smearing.  The long, high side is
351 ##  stored in %xmm7 as before, and the short, low side is in
352 ##  the high bits of %xmm6.
354 ##  This schedule is somewhat nastier, however, because each
355 ##  round produces 192 bits of key material, or 1.5 round keys.
356 ##  Therefore, on each cycle we do 2 rounds and produce 3 round
361 	movdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
362 	call	_vpaes_schedule_transform	# input transform
363 	movdqa	%xmm0,	%xmm6		# save short part
364 	pxor	%xmm4,	%xmm4		# clear 4
365 	movhlps	%xmm4,	%xmm6		# clobber low side with zeros
369 	call	_vpaes_schedule_round
370 	palignr	\$8,%xmm6,%xmm0
371 	call	_vpaes_schedule_mangle	# save key n
372 	call	_vpaes_schedule_192_smear
373 	call	_vpaes_schedule_mangle	# save key n+1
374 	call	_vpaes_schedule_round
376 	jz 	.Lschedule_mangle_last
377 	call	_vpaes_schedule_mangle	# save key n+2
378 	call	_vpaes_schedule_192_smear
379 	jmp	.Loop_schedule_192
384 ##  256-bit specific part of key schedule.
386 ##  The structure here is very similar to the 128-bit
387 ##  schedule, but with an additional "low side" in
388 ##  %xmm6.  The low side's rounds are the same as the
389 ##  high side's, except no rcon and no rotation.
393 	movdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
394 	call	_vpaes_schedule_transform	# input transform
398 	call	_vpaes_schedule_mangle	# output low result
399 	movdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
402 	call	_vpaes_schedule_round
404 	jz 	.Lschedule_mangle_last
405 	call	_vpaes_schedule_mangle
407 	# low round. swap xmm7 and xmm6
# broadcast the high dword of %xmm0 into all lanes for the low round
408 	pshufd	\$0xFF,	%xmm0,	%xmm0
411 	call	_vpaes_schedule_low_round
414 	jmp	.Loop_schedule_256
418 ##  .aes_schedule_mangle_last
420 ##  Mangler for last round of key schedule
422 ##  when encrypting, outputs out(%xmm0) ^ 63
423 ##  when decrypting, outputs unskew(%xmm0)
425 ##  Always called right before return... jumps to cleanup and exits
428 .Lschedule_mangle_last:
429 	# schedule last round key from xmm0
430 	lea	.Lk_deskew(%rip),%r11	# prepare to deskew
432 	jnz	.Lschedule_mangle_last_dec
435 	movdqa	(%r8,%r10),%xmm1
436 	pshufb	%xmm1,	%xmm0		# output permute
437 	lea	.Lk_opt(%rip),	%r11	# prepare to output transform
440 .Lschedule_mangle_last_dec:
442 	pxor	.Lk_s63(%rip),	%xmm0
443 	call	_vpaes_schedule_transform # output transform
444 	movdqu	%xmm0,	(%rdx)		# save last key
456 .size	_vpaes_schedule_core,.-_vpaes_schedule_core
459 ##  .aes_schedule_192_smear
461 ##  Smear the short, low side in the 192-bit key schedule.
464 ##  	%xmm7: high side, b  a  x  y
465 ##  	%xmm6: low side, d  c  0  0
469 ##	%xmm6: b+c+d  b+c 0 0
470 ##	%xmm0: b+c+d  b+c b a
472 .type	_vpaes_schedule_192_smear,\@abi-omnipotent
474 _vpaes_schedule_192_smear:
475 	pshufd	\$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
476 	pshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
477 	pxor	%xmm1,	%xmm6		# -> c+d c 0 0
479 	pxor	%xmm0,	%xmm6		# -> b+c+d b+c b a
# %xmm1 was zeroed above (per the shuffle comments); movhlps clears the low half
481 	movhlps	%xmm1,	%xmm6		# clobber low side with zeros
483 .size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
486 ##  .aes_schedule_round
488 ##  Runs one main round of the key schedule on %xmm0, %xmm7
490 ##  Specifically, runs subbytes on the high dword of %xmm0
491 ##  then rotates it by one byte and xors into the low dword of
494 ##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
497 ##  Smears the dwords of %xmm7 by xoring the low into the
498 ##  second low, result into third, result into highest.
500 ##  Returns results in %xmm7 = %xmm0.
501 ##  Clobbers %xmm1-%xmm4, %r11.
503 .type	_vpaes_schedule_round,\@abi-omnipotent
505 _vpaes_schedule_round:
506 	# extract rcon from xmm8
# the two palignr's pull the low rcon byte into %xmm1 and rotate %xmm8 for next time
508 	palignr	\$15,	%xmm8,	%xmm1
509 	palignr	\$15,	%xmm8,	%xmm8
513 	pshufd	\$0xFF,	%xmm0,	%xmm0
# rotate the broadcast word by one byte (the key-schedule RotWord step)
514 	palignr	\$1,	%xmm0,	%xmm0
518 	# low round: same as high round, but no rotation and no rcon.
519 _vpaes_schedule_low_round:
# add the s63 bias so the subsequent transformed S-box comes out right
527 	pxor	.Lk_s63(%rip), %xmm7
# subbytes: same split-nibble inverse-table substitution as the encrypt core
532 	psrld	\$4,    %xmm1	# 1 = i
533 	pand	%xmm9, 	%xmm0	# 0 = k
534 	movdqa	%xmm11, %xmm2	# 2 : a/k
535 	pshufb  %xmm0,  %xmm2	# 2 = a/k
536 	pxor	%xmm1,	%xmm0	# 0 = j
537 	movdqa	%xmm10,	%xmm3	# 3 : 1/i
538 	pshufb  %xmm1, 	%xmm3	# 3 = 1/i
539 	pxor	%xmm2, 	%xmm3	# 3 = iak = 1/i + a/k
540 	movdqa	%xmm10,	%xmm4	# 4 : 1/j
541 	pshufb	%xmm0, 	%xmm4	# 4 = 1/j
542 	pxor	%xmm2, 	%xmm4	# 4 = jak = 1/j + a/k
543 	movdqa	%xmm10,	%xmm2	# 2 : 1/iak
544 	pshufb  %xmm3,	%xmm2	# 2 = 1/iak
545 	pxor	%xmm0, 	%xmm2	# 2 = io
546 	movdqa	%xmm10, %xmm3	# 3 : 1/jak
547 	pshufb  %xmm4,  %xmm3	# 3 = 1/jak
548 	pxor	%xmm1,  %xmm3	# 3 = jo
549 	movdqa	%xmm13, %xmm4	# 4 : sbou
550 	pshufb  %xmm2,  %xmm4	# 4 = sbou
551 	movdqa	%xmm12, %xmm0	# 0 : sbot
552 	pshufb  %xmm3,	%xmm0	# 0 = sb1t
553 	pxor	%xmm4, 	%xmm0	# 0 = sbox output
555 	# add in smeared stuff
559 .size	_vpaes_schedule_round,.-_vpaes_schedule_round
562 ##  .aes_schedule_transform
564 ##  Linear-transform %xmm0 according to tables at (%r11)
566 ##  Requires that %xmm9 = 0x0F0F... as in preheat
568 ##  Clobbers %xmm1, %xmm2
570 .type	_vpaes_schedule_transform,\@abi-omnipotent
572 _vpaes_schedule_transform:
# split-nibble table lookup: lo table at (%r11), hi table at 16(%r11)
577 	movdqa	(%r11), %xmm2 	# lo
579 	movdqa	16(%r11), %xmm0 # hi
583 .size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
586 ##  .aes_schedule_mangle
588 ##  Mangle xmm0 from (basis-transformed) standard version
593 ##  	multiply by circulant 0,1,1,1
594 ##  	apply shiftrows transform
598 ##  	multiply by "inverse mixcolumns" circulant E,B,D,9
600 ##  	apply shiftrows transform
603 ##  Writes out to (%rdx), and increments or decrements it
604 ##  Keeps track of round number mod 4 in %r8
606 ##  Clobbers xmm1-xmm5
608 .type	_vpaes_schedule_mangle,\@abi-omnipotent
610 _vpaes_schedule_mangle:
611 	movdqa	%xmm0,	%xmm4	# save xmm0 for later
612 	movdqa	.Lk_mc_forward(%rip),%xmm5
614 	jnz	.Lschedule_mangle_dec
618 	pxor	.Lk_s63(%rip),%xmm4
626 	jmp	.Lschedule_mangle_both
628 .Lschedule_mangle_dec:
629 	# inverse mix columns
# .Lk_dksd/.Lk_dksb/.Lk_dkse/.Lk_dks9 are laid out consecutively from %r11
630 	lea	.Lk_dksd(%rip),%r11
633 	psrld	\$4,    %xmm1	# 1 = hi
634 	pand	%xmm9,	%xmm4	# 4 = lo
636 	movdqa	0x00(%r11), %xmm2
638 	movdqa	0x10(%r11), %xmm3
643 	movdqa	0x20(%r11), %xmm2
646 	movdqa	0x30(%r11), %xmm3
651 	movdqa	0x40(%r11), %xmm2
654 	movdqa	0x50(%r11), %xmm3
659 	movdqa	0x60(%r11), %xmm2
662 	movdqa	0x70(%r11), %xmm3
668 .Lschedule_mangle_both:
# apply the shiftrows permutation selected by the round counter in %r8
669 	movdqa	(%r8,%r10),%xmm1
675 .size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
678 # Interface to OpenSSL
##
##  ${PREFIX}_set_encrypt_key(userKey, bits, key): public entry point that
##  drives _vpaes_schedule_core for encryption-direction key expansion.
##
680 .globl	${PREFIX}_set_encrypt_key
681 .type	${PREFIX}_set_encrypt_key,\@function,3
683 ${PREFIX}_set_encrypt_key:
# Win64 ABI: %xmm6-%xmm15 are callee-saved, so spill them around the core call
685 $code.=<<___ if ($win64);
687 	movaps	%xmm6,0x10(%rsp)
688 	movaps	%xmm7,0x20(%rsp)
689 	movaps	%xmm8,0x30(%rsp)
690 	movaps	%xmm9,0x40(%rsp)
691 	movaps	%xmm10,0x50(%rsp)
692 	movaps	%xmm11,0x60(%rsp)
693 	movaps	%xmm12,0x70(%rsp)
694 	movaps	%xmm13,0x80(%rsp)
695 	movaps	%xmm14,0x90(%rsp)
696 	movaps	%xmm15,0xa0(%rsp)
703 	mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
707 	call	_vpaes_schedule_core
709 $code.=<<___ if ($win64);
710 	movaps	0x10(%rsp),%xmm6
711 	movaps	0x20(%rsp),%xmm7
712 	movaps	0x30(%rsp),%xmm8
713 	movaps	0x40(%rsp),%xmm9
714 	movaps	0x50(%rsp),%xmm10
715 	movaps	0x60(%rsp),%xmm11
716 	movaps	0x70(%rsp),%xmm12
717 	movaps	0x80(%rsp),%xmm13
718 	movaps	0x90(%rsp),%xmm14
719 	movaps	0xa0(%rsp),%xmm15
726 .size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
##
##  ${PREFIX}_set_decrypt_key(userKey, bits, key): key expansion for the
##  decryption direction; schedule is written from the end of the key
##  buffer backwards (see the lea into %rdx below).
##
728 .globl	${PREFIX}_set_decrypt_key
729 .type	${PREFIX}_set_decrypt_key,\@function,3
731 ${PREFIX}_set_decrypt_key:
# Win64 ABI: save callee-saved %xmm6-%xmm15
733 $code.=<<___ if ($win64);
735 	movaps	%xmm6,0x10(%rsp)
736 	movaps	%xmm7,0x20(%rsp)
737 	movaps	%xmm8,0x30(%rsp)
738 	movaps	%xmm9,0x40(%rsp)
739 	movaps	%xmm10,0x50(%rsp)
740 	movaps	%xmm11,0x60(%rsp)
741 	movaps	%xmm12,0x70(%rsp)
742 	movaps	%xmm13,0x80(%rsp)
743 	movaps	%xmm14,0x90(%rsp)
744 	movaps	%xmm15,0xa0(%rsp)
751 	mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
# point %rdx at the last schedule slot: decrypt keys are emitted in reverse
753 	lea	16(%rdx,%rax),%rdx
759 	xor	\$32,%r8d		# nbits==192?0:32
760 	call	_vpaes_schedule_core
762 $code.=<<___ if ($win64);
763 	movaps	0x10(%rsp),%xmm6
764 	movaps	0x20(%rsp),%xmm7
765 	movaps	0x30(%rsp),%xmm8
766 	movaps	0x40(%rsp),%xmm9
767 	movaps	0x50(%rsp),%xmm10
768 	movaps	0x60(%rsp),%xmm11
769 	movaps	0x70(%rsp),%xmm12
770 	movaps	0x80(%rsp),%xmm13
771 	movaps	0x90(%rsp),%xmm14
772 	movaps	0xa0(%rsp),%xmm15
779 .size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
##
##  ${PREFIX}_encrypt(in, out, key): single-block ECB encrypt wrapper
##  around _vpaes_encrypt_core, with Win64 xmm save/restore.
##
781 .globl	${PREFIX}_encrypt
782 .type	${PREFIX}_encrypt,\@function,3
786 $code.=<<___ if ($win64);
788 	movaps	%xmm6,0x10(%rsp)
789 	movaps	%xmm7,0x20(%rsp)
790 	movaps	%xmm8,0x30(%rsp)
791 	movaps	%xmm9,0x40(%rsp)
792 	movaps	%xmm10,0x50(%rsp)
793 	movaps	%xmm11,0x60(%rsp)
794 	movaps	%xmm12,0x70(%rsp)
795 	movaps	%xmm13,0x80(%rsp)
796 	movaps	%xmm14,0x90(%rsp)
797 	movaps	%xmm15,0xa0(%rsp)
803 	call	_vpaes_encrypt_core
806 $code.=<<___ if ($win64);
807 	movaps	0x10(%rsp),%xmm6
808 	movaps	0x20(%rsp),%xmm7
809 	movaps	0x30(%rsp),%xmm8
810 	movaps	0x40(%rsp),%xmm9
811 	movaps	0x50(%rsp),%xmm10
812 	movaps	0x60(%rsp),%xmm11
813 	movaps	0x70(%rsp),%xmm12
814 	movaps	0x80(%rsp),%xmm13
815 	movaps	0x90(%rsp),%xmm14
816 	movaps	0xa0(%rsp),%xmm15
822 .size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
##
##  ${PREFIX}_decrypt(in, out, key): single-block ECB decrypt wrapper
##  around _vpaes_decrypt_core, with Win64 xmm save/restore.
##
824 .globl	${PREFIX}_decrypt
825 .type	${PREFIX}_decrypt,\@function,3
829 $code.=<<___ if ($win64);
831 	movaps	%xmm6,0x10(%rsp)
832 	movaps	%xmm7,0x20(%rsp)
833 	movaps	%xmm8,0x30(%rsp)
834 	movaps	%xmm9,0x40(%rsp)
835 	movaps	%xmm10,0x50(%rsp)
836 	movaps	%xmm11,0x60(%rsp)
837 	movaps	%xmm12,0x70(%rsp)
838 	movaps	%xmm13,0x80(%rsp)
839 	movaps	%xmm14,0x90(%rsp)
840 	movaps	%xmm15,0xa0(%rsp)
846 	call	_vpaes_decrypt_core
849 $code.=<<___ if ($win64);
850 	movaps	0x10(%rsp),%xmm6
851 	movaps	0x20(%rsp),%xmm7
852 	movaps	0x30(%rsp),%xmm8
853 	movaps	0x40(%rsp),%xmm9
854 	movaps	0x50(%rsp),%xmm10
855 	movaps	0x60(%rsp),%xmm11
856 	movaps	0x70(%rsp),%xmm12
857 	movaps	0x80(%rsp),%xmm13
858 	movaps	0x90(%rsp),%xmm14
859 	movaps	0xa0(%rsp),%xmm15
865 .size	${PREFIX}_decrypt,.-${PREFIX}_decrypt
##
##  ${PREFIX}_cbc_encrypt: CBC-mode bulk wrapper.  IV is carried in %xmm6
##  (preserved across the cores, which only clobber %xmm1-%xmm5).
##  Note: does not handle partial (non-multiple-of-16) lengths — callers
##  (EVP) guarantee whole blocks.
##
868 my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
869 # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
870 #                       size_t length, const AES_KEY *key,
871 #                       unsigned char *ivp,const int enc);
873 .globl	${PREFIX}_cbc_encrypt
874 .type	${PREFIX}_cbc_encrypt,\@function,6
876 ${PREFIX}_cbc_encrypt:
# perl-side register swap so $len/$key match the asm below
879 	($len,$key)=($key,$len);
884 $code.=<<___ if ($win64);
886 	movaps	%xmm6,0x10(%rsp)
887 	movaps	%xmm7,0x20(%rsp)
888 	movaps	%xmm8,0x30(%rsp)
889 	movaps	%xmm9,0x40(%rsp)
890 	movaps	%xmm10,0x50(%rsp)
891 	movaps	%xmm11,0x60(%rsp)
892 	movaps	%xmm12,0x70(%rsp)
893 	movaps	%xmm13,0x80(%rsp)
894 	movaps	%xmm14,0x90(%rsp)
895 	movaps	%xmm15,0xa0(%rsp)
899 	movdqu	($ivp),%xmm6		# load IV
909 	call	_vpaes_encrypt_core
# store ciphertext; ($out,$inp) addressing suggests $out holds out-in delta here
# — NOTE(review): confirm against the elided pointer-setup lines
911 	movdqu	%xmm0,($out,$inp)
920 	call	_vpaes_decrypt_core
923 	movdqu	%xmm0,($out,$inp)
928 	movdqu	%xmm6,($ivp)		# save IV
930 $code.=<<___ if ($win64);
931 	movaps	0x10(%rsp),%xmm6
932 	movaps	0x20(%rsp),%xmm7
933 	movaps	0x30(%rsp),%xmm8
934 	movaps	0x40(%rsp),%xmm9
935 	movaps	0x50(%rsp),%xmm10
936 	movaps	0x60(%rsp),%xmm11
937 	movaps	0x70(%rsp),%xmm12
938 	movaps	0x80(%rsp),%xmm13
939 	movaps	0x90(%rsp),%xmm14
940 	movaps	0xa0(%rsp),%xmm15
947 .size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
##
##  _vpaes_preheat: load the constant tables into %xmm9-%xmm15 and point
##  %r10 at .Lk_s0F so later code can address the table block RIP-relatively.
##
954 ##  Fills register %r10 -> .aes_consts (so you can -fPIC)
955 ##  and %xmm9-%xmm15 as specified below.
957 .type	_vpaes_preheat,\@abi-omnipotent
960 	lea	.Lk_s0F(%rip), %r10
961 	movdqa	-0x20(%r10), %xmm10	# .Lk_inv
962 	movdqa	-0x10(%r10), %xmm11	# .Lk_inv+16
963 	movdqa	0x00(%r10), %xmm9	# .Lk_s0F
964 	movdqa	0x30(%r10), %xmm13	# .Lk_sb1
965 	movdqa	0x40(%r10), %xmm12	# .Lk_sb1+16
966 	movdqa	0x50(%r10), %xmm15	# .Lk_sb2
967 	movdqa	0x60(%r10), %xmm14	# .Lk_sb2+16
969 .size	_vpaes_preheat,.-_vpaes_preheat
970 ########################################################
974 ########################################################
##
##  _vpaes_consts: read-only constant tables for the vpaes implementation.
##  Layout is position-dependent — code above addresses these by fixed
##  byte offsets from .Lk_s0F / .Lk_dsbd / .Lk_dksd; do not reorder.
##
975 .type	_vpaes_consts,\@object
979 	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
980 	.quad	0x01040A060F0B0780, 0x030D0E0C02050809
983 	.quad	0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
985 .Lk_ipt:	# input transform (lo, hi)
986 	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
987 	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
989 .Lk_sb1:	# sb1u, sb1t
990 	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
991 	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
992 .Lk_sb2:	# sb2u, sb2t
993 	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
994 	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
995 .Lk_sbo:	# sbou, sbot
996 	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
997 	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
999 .Lk_mc_forward:	# mc_forward
1000 	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
1001 	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
1002 	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
1003 	.quad	0x000302010C0F0E0D, 0x080B0A0904070605
1005 .Lk_mc_backward:# mc_backward
1006 	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
1007 	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
1008 	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
1009 	.quad	0x0A09080B06050407, 0x020100030E0D0C0F
# shiftrows permutations (indexed by round number mod 4)
1012 	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
1013 	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
1014 	.quad	0x0F060D040B020900, 0x070E050C030A0108
1015 	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
# round-constant material consumed a byte at a time by _vpaes_schedule_round
1018 	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
1020 .Lk_s63:	# s63: all equal to 0x63 transformed
1021 	.quad	0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
1023 .Lk_opt:	# output transform
1024 	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
1025 	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
1027 .Lk_deskew:	# deskew tables: inverts the sbox's "skew"
1028 	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
1029 	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
1033 ##  Key schedule constants
1035 .Lk_dksd:	# decryption key schedule: invskew x*D
1036 	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
1037 	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
1038 .Lk_dksb:	# decryption key schedule: invskew x*B
1039 	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
1040 	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
1041 .Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
1042 	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
1043 	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
1044 .Lk_dks9:	# decryption key schedule: invskew x*9
1045 	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
1046 	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
1050 ##  Round function constants
1052 .Lk_dipt:	# decryption input transform
1053 	.quad	0x0F505B040B545F00, 0x154A411E114E451A
1054 	.quad	0x86E383E660056500, 0x12771772F491F194
1056 .Lk_dsb9:	# decryption sbox output *9*u, *9*t
1057 	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
1058 	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
1059 .Lk_dsbd:	# decryption sbox output *D*u, *D*t
1060 	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
1061 	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
1062 .Lk_dsbb:	# decryption sbox output *B*u, *B*t
1063 	.quad	0xD022649296B44200, 0x602646F6B0F2D404
1064 	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
1065 .Lk_dsbe:	# decryption sbox output *E*u, *E*t
1066 	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
1067 	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
1068 .Lk_dsbo:	# decryption sbox final output
1069 	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
1070 	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
1071 .asciz	"Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
1073 .size	_vpaes_consts,.-_vpaes_consts
##
##  se_handler: Win64 structured-exception handler shared by all public
##  entry points.  If the fault lies between the recorded prologue and
##  epilogue labels, it restores %xmm6-%xmm15 from the stack frame and
##  unwinds via RtlVirtualUnwind; otherwise it passes through unchanged.
##
1077 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1078 #		CONTEXT *context,DISPATCHER_CONTEXT *disp)
1085 .extern	__imp_RtlVirtualUnwind
1086 .type	se_handler,\@abi-omnipotent
1100 	mov	120($context),%rax	# pull context->Rax
1101 	mov	248($context),%rbx	# pull context->Rip
1103 	mov	8($disp),%rsi		# disp->ImageBase
1104 	mov	56($disp),%r11		# disp->HandlerData
1106 	mov	0(%r11),%r10d		# HandlerData[0]
1107 	lea	(%rsi,%r10),%r10	# prologue label
1108 	cmp	%r10,%rbx		# context->Rip<prologue label
1111 	mov	152($context),%rax	# pull context->Rsp
1113 	mov	4(%r11),%r10d		# HandlerData[1]
1114 	lea	(%rsi,%r10),%r10	# epilogue label
1115 	cmp	%r10,%rbx		# context->Rip>=epilogue label
# copy the 10 saved xmm registers back into the CONTEXT's Xmm6.. slots
1118 	lea	16(%rax),%rsi		# %xmm save area
1119 	lea	512($context),%rdi	# &context.Xmm6
1120 	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
1121 	.long	0xa548f3fc		# cld; rep movsq
1122 	lea	0xb8(%rax),%rax		# adjust stack pointer
1127 	mov	%rax,152($context)	# restore context->Rsp
1128 	mov	%rsi,168($context)	# restore context->Rsi
1129 	mov	%rdi,176($context)	# restore context->Rdi
# duplicate ContextRecord into our CONTEXT before the unwind call
1131 	mov	40($disp),%rdi		# disp->ContextRecord
1132 	mov	$context,%rsi		# context
1133 	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
1134 	.long	0xa548f3fc		# cld; rep movsq
1137 	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
1138 	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
1139 	mov	0(%rsi),%r8		# arg3, disp->ControlPc
1140 	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
1141 	mov	40(%rsi),%r10		# disp->ContextRecord
1142 	lea	56(%rsi),%r11		# &disp->HandlerData
1143 	lea	24(%rsi),%r12		# &disp->EstablisherFrame
1144 	mov	%r10,32(%rsp)		# arg5
1145 	mov	%r11,40(%rsp)		# arg6
1146 	mov	%r12,48(%rsp)		# arg7
1147 	mov	%rcx,56(%rsp)		# arg8, (NULL)
1148 	call	*__imp_RtlVirtualUnwind(%rip)
1150 	mov	\$1,%eax		# ExceptionContinueSearch
1162 .size	se_handler,.-se_handler
##
##  Win64 SEH tables: .pdata-style begin/end/info triples mapping each
##  public function's code range to its unwind info, followed by the
##  per-function HandlerData (prologue/epilogue label pairs).
##
1166 	.rva	.LSEH_begin_${PREFIX}_set_encrypt_key
1167 	.rva	.LSEH_end_${PREFIX}_set_encrypt_key
1168 	.rva	.LSEH_info_${PREFIX}_set_encrypt_key
1170 	.rva	.LSEH_begin_${PREFIX}_set_decrypt_key
1171 	.rva	.LSEH_end_${PREFIX}_set_decrypt_key
1172 	.rva	.LSEH_info_${PREFIX}_set_decrypt_key
1174 	.rva	.LSEH_begin_${PREFIX}_encrypt
1175 	.rva	.LSEH_end_${PREFIX}_encrypt
1176 	.rva	.LSEH_info_${PREFIX}_encrypt
1178 	.rva	.LSEH_begin_${PREFIX}_decrypt
1179 	.rva	.LSEH_end_${PREFIX}_decrypt
1180 	.rva	.LSEH_info_${PREFIX}_decrypt
1182 	.rva	.LSEH_begin_${PREFIX}_cbc_encrypt
1183 	.rva	.LSEH_end_${PREFIX}_cbc_encrypt
1184 	.rva	.LSEH_info_${PREFIX}_cbc_encrypt
1188 .LSEH_info_${PREFIX}_set_encrypt_key:
1191 	.rva	.Lenc_key_body,.Lenc_key_epilogue	# HandlerData[]
1192 .LSEH_info_${PREFIX}_set_decrypt_key:
1195 	.rva	.Ldec_key_body,.Ldec_key_epilogue	# HandlerData[]
1196 .LSEH_info_${PREFIX}_encrypt:
1199 	.rva	.Lenc_body,.Lenc_epilogue		# HandlerData[]
1200 .LSEH_info_${PREFIX}_decrypt:
1203 	.rva	.Ldec_body,.Ldec_epilogue		# HandlerData[]
1204 .LSEH_info_${PREFIX}_cbc_encrypt:
1207 	.rva	.Lcbc_body,.Lcbc_epilogue		# HandlerData[]
# perlasm convention: evaluate `...` expressions embedded in $code before output
1211 $code =~ s/\`([^\`]*)\`/eval($1)/gem;