2 # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 ######################################################################
11 ## Constant-time SSSE3 AES core implementation.
14 ## By Mike Hamburg (Stanford University), 2009
17 ## For details see http://shiftleft.org/papers/vector_aes/ and
18 ## http://crypto.stanford.edu/vpaes/.
20 ######################################################################
23 # Interface to OpenSSL as "almost" drop-in replacement for
24 # aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
25 # doesn't handle partial vectors (doesn't have to if called from
26 # EVP only). "Drop-in" implies that this module doesn't share key
27 # schedule structure with the original nor does it make assumption
28 # about its alignment...
30 # Performance summary. aes-x86_64.pl column lists large-block CBC
31 # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
32 # byte processed with 128-bit key, and vpaes-x86_64.pl column -
33 # [also large-block CBC] encrypt/decrypt.
35 # aes-x86_64.pl vpaes-x86_64.pl
37 # Core 2(**) 29.6/41.1/14.3 21.9/25.2(***)
38 # Nehalem 29.6/40.3/14.6 10.0/11.8
39 # Atom 57.3/74.2/32.1 60.9/77.2(***)
40 # Silvermont 52.7/64.0/19.5 48.8/60.8(***)
42 # (*) "Hyper-threading" in the context refers rather to cache shared
43 # among multiple cores, than to specifically Intel HTT. As vast
44 # majority of contemporary cores share cache, slower code path
45 # is commonplace. In other words "with-hyper-threading-off"
46 # results are presented mostly for reference purposes.
48 # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
50 # (***) Less impressive improvement on Core 2 and Atom is due to slow
51 # pshufb, yet it's respectable +36%/62% improvement on Core 2
52 # (as implied, over "hyper-threading-safe" code path).
# Parse perlasm arguments: "flavour" (elf/macosx/mingw64/nasm/...) and the
# output file, then locate the x86_64-xlate.pl translator either next to
# this script or in ../../perlasm. All emitted code is piped through it.
58 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
60 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
62 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
63 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
64 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
65 die "can't locate x86_64-xlate.pl";
# Fail loudly if the translator pipe cannot be started instead of silently
# generating nothing; low-precedence "or" so the die binds to open's result,
# not to the command string ("|| die" after the string would never fire).
67 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
## _vpaes_encrypt_core: constant-time AES encryption of the block in %xmm0
## using the pre-expanded key schedule at (%rdx); result is left in %xmm0.
## No data-dependent table lookups — S-box work is done with pshufb against
## the constants preloaded by _vpaes_preheat, so it is cache-timing safe.
82 ## %xmm9-%xmm15 as in _vpaes_preheat
83 ## (%rdx) = scheduled keys
86 ## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
87 ## Preserves %xmm6 - %xmm8 so you get some local vectors
90 .type _vpaes_encrypt_core,\@abi-omnipotent
## Input transform: map the plaintext block into the tool's working basis
## via the .Lk_ipt lo/hi nibble tables, xoring in the round-0 key.
97 movdqa .Lk_ipt(%rip), %xmm2 # iptlo
99 movdqu (%r9), %xmm5 # round0 key
103 movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
108 lea .Lk_mc_backward(%rip),%r10
113 # middle of middle round
114 movdqa %xmm13, %xmm4 # 4 : sb1u
115 movdqa %xmm12, %xmm0 # 0 : sb1t
116 pshufb %xmm2, %xmm4 # 4 = sb1u
117 pshufb %xmm3, %xmm0 # 0 = sb1t
118 pxor %xmm5, %xmm4 # 4 = sb1u + k
119 movdqa %xmm15, %xmm5 # 4 : sb2u
120 pxor %xmm4, %xmm0 # 0 = A
## MixColumns is realized as byte rotations: %r11 cycles 0x00..0x30 through
## the four .Lk_mc_forward/.Lk_mc_backward rows (see "and \$0x30" below).
121 movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
122 pshufb %xmm2, %xmm5 # 4 = sb2u
123 movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
124 movdqa %xmm14, %xmm2 # 2 : sb2t
125 pshufb %xmm3, %xmm2 # 2 = sb2t
126 movdqa %xmm0, %xmm3 # 3 = A
127 pxor %xmm5, %xmm2 # 2 = 2A
128 pshufb %xmm1, %xmm0 # 0 = B
129 add \$16, %r9 # next key
130 pxor %xmm2, %xmm0 # 0 = 2A+B
131 pshufb %xmm4, %xmm3 # 3 = D
132 add \$16, %r11 # next mc
133 pxor %xmm0, %xmm3 # 3 = 2A+B+D
134 pshufb %xmm1, %xmm0 # 0 = 2B+C
135 and \$0x30, %r11 # ... mod 4
137 pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
## SubBytes, shared with decryption: split state into high nibble i and low
## nibble k, then combine GF(2^4)-style lookups from the .Lk_inv tables
## (%xmm10/%xmm11) into the "io"/"jo" indices used by the output sboxes.
141 movdqa %xmm9, %xmm1 # 1 : i
142 movdqa %xmm11, %xmm5 # 2 : a/k
143 pandn %xmm0, %xmm1 # 1 = i<<4
144 psrld \$4, %xmm1 # 1 = i
145 pand %xmm9, %xmm0 # 0 = k
146 pshufb %xmm0, %xmm5 # 2 = a/k
147 movdqa %xmm10, %xmm3 # 3 : 1/i
148 pxor %xmm1, %xmm0 # 0 = j
149 pshufb %xmm1, %xmm3 # 3 = 1/i
150 movdqa %xmm10, %xmm4 # 4 : 1/j
151 pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
152 pshufb %xmm0, %xmm4 # 4 = 1/j
153 movdqa %xmm10, %xmm2 # 2 : 1/iak
154 pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
155 pshufb %xmm3, %xmm2 # 2 = 1/iak
156 movdqa %xmm10, %xmm3 # 3 : 1/jak
157 pxor %xmm0, %xmm2 # 2 = io
158 pshufb %xmm4, %xmm3 # 3 = 1/jak
160 pxor %xmm1, %xmm3 # 3 = jo
163 # middle of last round
164 movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
165 movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
166 pshufb %xmm2, %xmm4 # 4 = sbou
167 pxor %xmm5, %xmm4 # 4 = sb1u + k
168 pshufb %xmm3, %xmm0 # 0 = sb1t
## Final ShiftRows permutation comes from .Lk_sr[], indexed by round mod 4.
169 movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
170 pxor %xmm4, %xmm0 # 0 = A
173 .size _vpaes_encrypt_core,.-_vpaes_encrypt_core
## _vpaes_decrypt_core: constant-time AES decryption counterpart of
## _vpaes_encrypt_core. Block in %xmm0, schedule at (%rdx), result in %xmm0.
178 ## Same API as encryption core.
180 .type _vpaes_decrypt_core,\@abi-omnipotent
183 mov %rdx, %r9 # load key
## Decryption input transform via the .Lk_dipt lo/hi nibble tables.
186 movdqa .Lk_dipt(%rip), %xmm2 # iptlo
190 movdqu (%r9), %xmm5 # round0 key
194 movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi
196 lea .Lk_dsbd(%rip),%r10
## %xmm5 holds the rotating MixColumns permutation; it is advanced each
## round by the palignr near the bottom of the loop.
200 movdqa .Lk_mc_forward+48(%rip), %xmm5
209 ## Inverse mix columns
## Round body: accumulate the four inverse-MixColumns sbox pairs
## (*9, *D, *B, *E — tables at -0x20..0x50 off .Lk_dsbd) into %xmm0,
## interleaving an "MC ch" permute between each pair.
211 movdqa -0x20(%r10),%xmm4 # 4 : sb9u
212 movdqa -0x10(%r10),%xmm1 # 0 : sb9t
213 pshufb %xmm2, %xmm4 # 4 = sb9u
214 pshufb %xmm3, %xmm1 # 0 = sb9t
216 movdqa 0x00(%r10),%xmm4 # 4 : sbdu
217 pxor %xmm1, %xmm0 # 0 = ch
218 movdqa 0x10(%r10),%xmm1 # 0 : sbdt
220 pshufb %xmm2, %xmm4 # 4 = sbdu
221 pshufb %xmm5, %xmm0 # MC ch
222 pshufb %xmm3, %xmm1 # 0 = sbdt
223 pxor %xmm4, %xmm0 # 4 = ch
224 movdqa 0x20(%r10),%xmm4 # 4 : sbbu
225 pxor %xmm1, %xmm0 # 0 = ch
226 movdqa 0x30(%r10),%xmm1 # 0 : sbbt
228 pshufb %xmm2, %xmm4 # 4 = sbbu
229 pshufb %xmm5, %xmm0 # MC ch
230 pshufb %xmm3, %xmm1 # 0 = sbbt
231 pxor %xmm4, %xmm0 # 4 = ch
232 movdqa 0x40(%r10),%xmm4 # 4 : sbeu
233 pxor %xmm1, %xmm0 # 0 = ch
234 movdqa 0x50(%r10),%xmm1 # 0 : sbet
236 pshufb %xmm2, %xmm4 # 4 = sbeu
237 pshufb %xmm5, %xmm0 # MC ch
238 pshufb %xmm3, %xmm1 # 0 = sbet
239 pxor %xmm4, %xmm0 # 4 = ch
240 add \$16, %r9 # next round key
241 palignr \$12, %xmm5, %xmm5
242 pxor %xmm1, %xmm0 # 0 = ch
## Shared nibble-split inverse S-box computation (same scheme as the
## encrypt core): i/k nibbles -> io/jo indices via the .Lk_inv tables.
247 movdqa %xmm9, %xmm1 # 1 : i
248 pandn %xmm0, %xmm1 # 1 = i<<4
249 movdqa %xmm11, %xmm2 # 2 : a/k
250 psrld \$4, %xmm1 # 1 = i
251 pand %xmm9, %xmm0 # 0 = k
252 pshufb %xmm0, %xmm2 # 2 = a/k
253 movdqa %xmm10, %xmm3 # 3 : 1/i
254 pxor %xmm1, %xmm0 # 0 = j
255 pshufb %xmm1, %xmm3 # 3 = 1/i
256 movdqa %xmm10, %xmm4 # 4 : 1/j
257 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
258 pshufb %xmm0, %xmm4 # 4 = 1/j
259 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
260 movdqa %xmm10, %xmm2 # 2 : 1/iak
261 pshufb %xmm3, %xmm2 # 2 = 1/iak
262 movdqa %xmm10, %xmm3 # 3 : 1/jak
263 pxor %xmm0, %xmm2 # 2 = io
264 pshufb %xmm4, %xmm3 # 3 = 1/jak
266 pxor %xmm1, %xmm3 # 3 = jo
269 # middle of last round
270 movdqa 0x60(%r10), %xmm4 # 3 : sbou
271 pshufb %xmm2, %xmm4 # 4 = sbou
272 pxor %xmm0, %xmm4 # 4 = sb1u + k
273 movdqa 0x70(%r10), %xmm0 # 0 : sbot
## .Lk_sr is reached relative to .Lk_dsbd (%r10) with a fixed offset.
274 movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
275 pshufb %xmm3, %xmm0 # 0 = sb1t
276 pxor %xmm4, %xmm0 # 0 = A
279 .size _vpaes_decrypt_core,.-_vpaes_decrypt_core
281 ########################################################
283 ## AES key schedule ##
285 ########################################################
## _vpaes_schedule_core: expand the user key at (%rdi) into the vpaes
## round-key schedule written through (%rdx), for all three key sizes.
## %rcx selects direction (see below); keys are stored transformed so the
## encrypt/decrypt cores never need the standard-basis schedule.
286 .type _vpaes_schedule_core,\@abi-omnipotent
288 _vpaes_schedule_core:
292 # rcx = direction. 0=encrypt, 1=decrypt
294 call _vpaes_preheat # load the tables
295 movdqa .Lk_rcon(%rip), %xmm8 # load rcon
296 movdqu (%rdi), %xmm0 # load key (unaligned)
## Input transform of the raw key into the working basis (.Lk_ipt via
## _vpaes_schedule_transform, which takes its table pointer in %r11).
300 lea .Lk_ipt(%rip), %r11
301 call _vpaes_schedule_transform
304 lea .Lk_sr(%rip),%r10
306 jnz .Lschedule_am_decrypting
308 # encrypting, output zeroth round key after transform
312 .Lschedule_am_decrypting:
313 # decrypting, output zeroth round key after shiftrows
314 movdqa (%r8,%r10),%xmm1
328 ## 128-bit specific part of key schedule.
330 ## This schedule is really simple, because all its parts
331 ## are accomplished by the subroutines.
337 call _vpaes_schedule_round
339 jz .Lschedule_mangle_last
340 call _vpaes_schedule_mangle # write output
341 jmp .Loop_schedule_128
346 ## 192-bit specific part of key schedule.
348 ## The main body of this schedule is the same as the 128-bit
349 ## schedule, but with more smearing. The long, high side is
350 ## stored in %xmm7 as before, and the short, low side is in
351 ## the high bits of %xmm6.
353 ## This schedule is somewhat nastier, however, because each
354 ## round produces 192 bits of key material, or 1.5 round keys.
355 ## Therefore, on each cycle we do 2 rounds and produce 3 round
360 movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
361 call _vpaes_schedule_transform # input transform
362 movdqa %xmm0, %xmm6 # save short part
363 pxor %xmm4, %xmm4 # clear 4
364 movhlps %xmm4, %xmm6 # clobber low side with zeros
368 call _vpaes_schedule_round
369 palignr \$8,%xmm6,%xmm0
370 call _vpaes_schedule_mangle # save key n
371 call _vpaes_schedule_192_smear
372 call _vpaes_schedule_mangle # save key n+1
373 call _vpaes_schedule_round
375 jz .Lschedule_mangle_last
376 call _vpaes_schedule_mangle # save key n+2
377 call _vpaes_schedule_192_smear
378 jmp .Loop_schedule_192
383 ## 256-bit specific part of key schedule.
385 ## The structure here is very similar to the 128-bit
386 ## schedule, but with an additional "low side" in
387 ## %xmm6. The low side's rounds are the same as the
388 ## high side's, except no rcon and no rotation.
392 movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
393 call _vpaes_schedule_transform # input transform
397 call _vpaes_schedule_mangle # output low result
398 movdqa %xmm0, %xmm6 # save cur_lo in xmm6
401 call _vpaes_schedule_round
403 jz .Lschedule_mangle_last
404 call _vpaes_schedule_mangle
406 # low round. swap xmm7 and xmm6
## Broadcast the high dword before the no-rcon/no-rotation low round.
407 pshufd \$0xFF, %xmm0, %xmm0
410 call _vpaes_schedule_low_round
413 jmp .Loop_schedule_256
417 ## .aes_schedule_mangle_last
419 ## Mangler for last round of key schedule
421 ## when encrypting, outputs out(%xmm0) ^ 63
422 ## when decrypting, outputs unskew(%xmm0)
424 ## Always called right before return... jumps to cleanup and exits
427 .Lschedule_mangle_last:
428 # schedule last round key from xmm0
429 lea .Lk_deskew(%rip),%r11 # prepare to deskew
431 jnz .Lschedule_mangle_last_dec
434 movdqa (%r8,%r10),%xmm1
435 pshufb %xmm1, %xmm0 # output permute
436 lea .Lk_opt(%rip), %r11 # prepare to output transform
439 .Lschedule_mangle_last_dec:
441 pxor .Lk_s63(%rip), %xmm0
442 call _vpaes_schedule_transform # output transform
443 movdqu %xmm0, (%rdx) # save last key
455 .size _vpaes_schedule_core,.-_vpaes_schedule_core
458 ## .aes_schedule_192_smear
460 ## Smear the short, low side in the 192-bit key schedule.
## Inputs (dwords listed high-to-low):
463 ## %xmm7: high side, b a x y
464 ## %xmm6: low side, d c 0 0
## Outputs:
468 ## %xmm6: b+c+d b+c 0 0
469 ## %xmm0: b+c+d b+c b a
471 .type _vpaes_schedule_192_smear,\@abi-omnipotent
473 _vpaes_schedule_192_smear:
474 pshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
475 pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
476 pxor %xmm1, %xmm6 # -> c+d c 0 0
478 pxor %xmm0, %xmm6 # -> b+c+d b+c b a
480 movhlps %xmm1, %xmm6 # clobber low side with zeros
482 .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
485 ## .aes_schedule_round
487 ## Runs one main round of the key schedule on %xmm0, %xmm7
489 ## Specifically, runs subbytes on the high dword of %xmm0
490 ## then rotates it by one byte and xors into the low dword of
493 ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
496 ## Smears the dwords of %xmm7 by xoring the low into the
497 ## second low, result into third, result into highest.
499 ## Returns results in %xmm7 = %xmm0.
500 ## Clobbers %xmm1-%xmm4, %r11.
502 .type _vpaes_schedule_round,\@abi-omnipotent
504 _vpaes_schedule_round:
505 # extract rcon from xmm8
## palignr by 15 both pulls the current rcon byte into %xmm1 and
## rotates %xmm8 so the next call sees the next round constant.
507 palignr \$15, %xmm8, %xmm1
508 palignr \$15, %xmm8, %xmm8
## Rotate-word step: broadcast high dword, then rotate it by one byte.
512 pshufd \$0xFF, %xmm0, %xmm0
513 palignr \$1, %xmm0, %xmm0
517 # low round: same as high round, but no rotation and no rcon.
518 _vpaes_schedule_low_round:
## .Lk_s63 compensates for the S-box's 0x63 constant in this basis.
526 pxor .Lk_s63(%rip), %xmm7
## SubBytes on %xmm0 using the same nibble-split .Lk_inv scheme as the
## cipher cores; result assembled from the sbou/sbot tables below.
531 psrld \$4, %xmm1 # 1 = i
532 pand %xmm9, %xmm0 # 0 = k
533 movdqa %xmm11, %xmm2 # 2 : a/k
534 pshufb %xmm0, %xmm2 # 2 = a/k
535 pxor %xmm1, %xmm0 # 0 = j
536 movdqa %xmm10, %xmm3 # 3 : 1/i
537 pshufb %xmm1, %xmm3 # 3 = 1/i
538 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
539 movdqa %xmm10, %xmm4 # 4 : 1/j
540 pshufb %xmm0, %xmm4 # 4 = 1/j
541 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
542 movdqa %xmm10, %xmm2 # 2 : 1/iak
543 pshufb %xmm3, %xmm2 # 2 = 1/iak
544 pxor %xmm0, %xmm2 # 2 = io
545 movdqa %xmm10, %xmm3 # 3 : 1/jak
546 pshufb %xmm4, %xmm3 # 3 = 1/jak
547 pxor %xmm1, %xmm3 # 3 = jo
548 movdqa %xmm13, %xmm4 # 4 : sbou
549 pshufb %xmm2, %xmm4 # 4 = sbou
550 movdqa %xmm12, %xmm0 # 0 : sbot
551 pshufb %xmm3, %xmm0 # 0 = sb1t
552 pxor %xmm4, %xmm0 # 0 = sbox output
554 # add in smeared stuff
558 .size _vpaes_schedule_round,.-_vpaes_schedule_round
561 ## .aes_schedule_transform
563 ## Linear-transform %xmm0 according to tables at (%r11)
565 ## Requires that %xmm9 = 0x0F0F... as in preheat
567 ## Clobbers %xmm1, %xmm2
## Nibble-split linear transform: low-nibble table at (%r11), high-nibble
## table at 16(%r11); the two pshufb results are combined into %xmm0.
569 .type _vpaes_schedule_transform,\@abi-omnipotent
571 _vpaes_schedule_transform:
576 movdqa (%r11), %xmm2 # lo
578 movdqa 16(%r11), %xmm0 # hi
582 .size _vpaes_schedule_transform,.-_vpaes_schedule_transform
585 ## .aes_schedule_mangle
587 ## Mangle xmm0 from (basis-transformed) standard version
## Encrypting path:
592 ## multiply by circulant 0,1,1,1
593 ## apply shiftrows transform
## Decrypting path:
597 ## multiply by "inverse mixcolumns" circulant E,B,D,9
599 ## apply shiftrows transform
602 ## Writes out to (%rdx), and increments or decrements it
603 ## Keeps track of round number mod 4 in %r8
605 ## Clobbers xmm1-xmm5
607 .type _vpaes_schedule_mangle,\@abi-omnipotent
609 _vpaes_schedule_mangle:
610 movdqa %xmm0, %xmm4 # save xmm0 for later
611 movdqa .Lk_mc_forward(%rip),%xmm5
613 jnz .Lschedule_mangle_dec
## Encrypt: fold in the s63 constant before the circulant multiply.
617 pxor .Lk_s63(%rip),%xmm4
625 jmp .Lschedule_mangle_both
627 .Lschedule_mangle_dec:
628 # inverse mix columns
## Decrypt: split the saved key into nibbles and run it through the four
## .Lk_dksd/.Lk_dksb/.Lk_dkse/.Lk_dks9 table pairs at 0x00..0x70(%r11).
629 lea .Lk_dksd(%rip),%r11
632 psrld \$4, %xmm1 # 1 = hi
633 pand %xmm9, %xmm4 # 4 = lo
635 movdqa 0x00(%r11), %xmm2
637 movdqa 0x10(%r11), %xmm3
642 movdqa 0x20(%r11), %xmm2
645 movdqa 0x30(%r11), %xmm3
650 movdqa 0x40(%r11), %xmm2
653 movdqa 0x50(%r11), %xmm3
658 movdqa 0x60(%r11), %xmm2
661 movdqa 0x70(%r11), %xmm3
667 .Lschedule_mangle_both:
## Common tail: apply the round-dependent shiftrows permute (.Lk_sr[%r8]).
668 movdqa (%r8,%r10),%xmm1
674 .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
677 # Interface to OpenSSL
## ${PREFIX}_set_encrypt_key: public 3-argument entry point that expands a
## user key into an AES_KEY via _vpaes_schedule_core. On Win64, xmm6-xmm15
## are callee-saved, hence the movaps spill/restore blocks below.
679 .globl ${PREFIX}_set_encrypt_key
680 .type ${PREFIX}_set_encrypt_key,\@function,3
682 ${PREFIX}_set_encrypt_key:
684 $code.=<<___ if ($win64);
686 movaps %xmm6,0x10(%rsp)
687 movaps %xmm7,0x20(%rsp)
688 movaps %xmm8,0x30(%rsp)
689 movaps %xmm9,0x40(%rsp)
690 movaps %xmm10,0x50(%rsp)
691 movaps %xmm11,0x60(%rsp)
692 movaps %xmm12,0x70(%rsp)
693 movaps %xmm13,0x80(%rsp)
694 movaps %xmm14,0x90(%rsp)
695 movaps %xmm15,0xa0(%rsp)
702 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
706 call _vpaes_schedule_core
708 $code.=<<___ if ($win64);
709 movaps 0x10(%rsp),%xmm6
710 movaps 0x20(%rsp),%xmm7
711 movaps 0x30(%rsp),%xmm8
712 movaps 0x40(%rsp),%xmm9
713 movaps 0x50(%rsp),%xmm10
714 movaps 0x60(%rsp),%xmm11
715 movaps 0x70(%rsp),%xmm12
716 movaps 0x80(%rsp),%xmm13
717 movaps 0x90(%rsp),%xmm14
718 movaps 0xa0(%rsp),%xmm15
725 .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
## ${PREFIX}_set_decrypt_key: like set_encrypt_key, but points the schedule
## at the *end* of the key buffer (lea below) and flips the direction flag
## so _vpaes_schedule_core emits decryption round keys.
727 .globl ${PREFIX}_set_decrypt_key
728 .type ${PREFIX}_set_decrypt_key,\@function,3
730 ${PREFIX}_set_decrypt_key:
732 $code.=<<___ if ($win64);
734 movaps %xmm6,0x10(%rsp)
735 movaps %xmm7,0x20(%rsp)
736 movaps %xmm8,0x30(%rsp)
737 movaps %xmm9,0x40(%rsp)
738 movaps %xmm10,0x50(%rsp)
739 movaps %xmm11,0x60(%rsp)
740 movaps %xmm12,0x70(%rsp)
741 movaps %xmm13,0x80(%rsp)
742 movaps %xmm14,0x90(%rsp)
743 movaps %xmm15,0xa0(%rsp)
750 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
## Decrypt schedule is written backwards from the last round key slot.
752 lea 16(%rdx,%rax),%rdx
758 xor \$32,%r8d # nbits==192?0:32
759 call _vpaes_schedule_core
761 $code.=<<___ if ($win64);
762 movaps 0x10(%rsp),%xmm6
763 movaps 0x20(%rsp),%xmm7
764 movaps 0x30(%rsp),%xmm8
765 movaps 0x40(%rsp),%xmm9
766 movaps 0x50(%rsp),%xmm10
767 movaps 0x60(%rsp),%xmm11
768 movaps 0x70(%rsp),%xmm12
769 movaps 0x80(%rsp),%xmm13
770 movaps 0x90(%rsp),%xmm14
771 movaps 0xa0(%rsp),%xmm15
778 .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
## ${PREFIX}_encrypt: public single-block encrypt; thin wrapper that
## preserves Win64 callee-saved xmm regs around _vpaes_encrypt_core.
780 .globl ${PREFIX}_encrypt
781 .type ${PREFIX}_encrypt,\@function,3
785 $code.=<<___ if ($win64);
787 movaps %xmm6,0x10(%rsp)
788 movaps %xmm7,0x20(%rsp)
789 movaps %xmm8,0x30(%rsp)
790 movaps %xmm9,0x40(%rsp)
791 movaps %xmm10,0x50(%rsp)
792 movaps %xmm11,0x60(%rsp)
793 movaps %xmm12,0x70(%rsp)
794 movaps %xmm13,0x80(%rsp)
795 movaps %xmm14,0x90(%rsp)
796 movaps %xmm15,0xa0(%rsp)
802 call _vpaes_encrypt_core
805 $code.=<<___ if ($win64);
806 movaps 0x10(%rsp),%xmm6
807 movaps 0x20(%rsp),%xmm7
808 movaps 0x30(%rsp),%xmm8
809 movaps 0x40(%rsp),%xmm9
810 movaps 0x50(%rsp),%xmm10
811 movaps 0x60(%rsp),%xmm11
812 movaps 0x70(%rsp),%xmm12
813 movaps 0x80(%rsp),%xmm13
814 movaps 0x90(%rsp),%xmm14
815 movaps 0xa0(%rsp),%xmm15
821 .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
## ${PREFIX}_decrypt: public single-block decrypt; mirror of
## ${PREFIX}_encrypt, calling _vpaes_decrypt_core.
823 .globl ${PREFIX}_decrypt
824 .type ${PREFIX}_decrypt,\@function,3
828 $code.=<<___ if ($win64);
830 movaps %xmm6,0x10(%rsp)
831 movaps %xmm7,0x20(%rsp)
832 movaps %xmm8,0x30(%rsp)
833 movaps %xmm9,0x40(%rsp)
834 movaps %xmm10,0x50(%rsp)
835 movaps %xmm11,0x60(%rsp)
836 movaps %xmm12,0x70(%rsp)
837 movaps %xmm13,0x80(%rsp)
838 movaps %xmm14,0x90(%rsp)
839 movaps %xmm15,0xa0(%rsp)
845 call _vpaes_decrypt_core
848 $code.=<<___ if ($win64);
849 movaps 0x10(%rsp),%xmm6
850 movaps 0x20(%rsp),%xmm7
851 movaps 0x30(%rsp),%xmm8
852 movaps 0x40(%rsp),%xmm9
853 movaps 0x50(%rsp),%xmm10
854 movaps 0x60(%rsp),%xmm11
855 movaps 0x70(%rsp),%xmm12
856 movaps 0x80(%rsp),%xmm13
857 movaps 0x90(%rsp),%xmm14
858 movaps 0xa0(%rsp),%xmm15
864 .size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
867 my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
868 # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
869 # size_t length, const AES_KEY *key,
870 # unsigned char *ivp,const int enc);
## CBC mode over whole 16-byte blocks only (partial tails are the caller's
## problem — see header comment: EVP never passes them). IV is carried in
## %xmm6 across the loop and written back at the end.
872 .globl ${PREFIX}_cbc_encrypt
873 .type ${PREFIX}_cbc_encrypt,\@function,6
875 ${PREFIX}_cbc_encrypt:
## Perl-side swap: from here on $len is %rcx and $key is %rdx, matching
## what the cores expect in %rdx.
878 ($len,$key)=($key,$len);
883 $code.=<<___ if ($win64);
885 movaps %xmm6,0x10(%rsp)
886 movaps %xmm7,0x20(%rsp)
887 movaps %xmm8,0x30(%rsp)
888 movaps %xmm9,0x40(%rsp)
889 movaps %xmm10,0x50(%rsp)
890 movaps %xmm11,0x60(%rsp)
891 movaps %xmm12,0x70(%rsp)
892 movaps %xmm13,0x80(%rsp)
893 movaps %xmm14,0x90(%rsp)
894 movaps %xmm15,0xa0(%rsp)
898 movdqu ($ivp),%xmm6 # load IV
## Encrypt loop body: core result stored at out+inp offset addressing.
908 call _vpaes_encrypt_core
910 movdqu %xmm0,($out,$inp)
## Decrypt loop body.
919 call _vpaes_decrypt_core
922 movdqu %xmm0,($out,$inp)
927 movdqu %xmm6,($ivp) # save IV
929 $code.=<<___ if ($win64);
930 movaps 0x10(%rsp),%xmm6
931 movaps 0x20(%rsp),%xmm7
932 movaps 0x30(%rsp),%xmm8
933 movaps 0x40(%rsp),%xmm9
934 movaps 0x50(%rsp),%xmm10
935 movaps 0x60(%rsp),%xmm11
936 movaps 0x70(%rsp),%xmm12
937 movaps 0x80(%rsp),%xmm13
938 movaps 0x90(%rsp),%xmm14
939 movaps 0xa0(%rsp),%xmm15
946 .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
## _vpaes_preheat: one-time table preload shared by all entry points.
953 ## Fills register %r10 -> .aes_consts (so you can -fPIC)
954 ## and %xmm9-%xmm15 as specified below.
956 .type _vpaes_preheat,\@abi-omnipotent
## %r10 anchors at .Lk_s0F; the other tables are reached by fixed offsets,
## keeping every reference RIP-relative (position-independent).
959 lea .Lk_s0F(%rip), %r10
960 movdqa -0x20(%r10), %xmm10 # .Lk_inv
961 movdqa -0x10(%r10), %xmm11 # .Lk_inv+16
962 movdqa 0x00(%r10), %xmm9 # .Lk_s0F
963 movdqa 0x30(%r10), %xmm13 # .Lk_sb1
964 movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16
965 movdqa 0x50(%r10), %xmm15 # .Lk_sb2
966 movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16
968 .size _vpaes_preheat,.-_vpaes_preheat
969 ########################################################
973 ########################################################
## _vpaes_consts: all lookup tables used by the cores and key schedule.
## Everything is addressed RIP-relative; 16-byte rows loaded with movdqa.
974 .type _vpaes_consts,\@object
## First rows: .Lk_inv (inverse tables) — their label lines fall outside
## this view; offsets match the -0x20/-0x10 loads in _vpaes_preheat.
978 .quad 0x0E05060F0D080180, 0x040703090A0B0C02
979 .quad 0x01040A060F0B0780, 0x030D0E0C02050809
## .Lk_s0F (low-nibble mask) — label line not visible here.
982 .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
984 .Lk_ipt: # input transform (lo, hi)
985 .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
986 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
988 .Lk_sb1: # sb1u, sb1t
989 .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
990 .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
991 .Lk_sb2: # sb2u, sb2t
992 .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
993 .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
994 .Lk_sbo: # sbou, sbot
995 .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
996 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
998 .Lk_mc_forward: # mc_forward
999 .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
1000 .quad 0x080B0A0904070605, 0x000302010C0F0E0D
1001 .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
1002 .quad 0x000302010C0F0E0D, 0x080B0A0904070605
1004 .Lk_mc_backward:# mc_backward
1005 .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
1006 .quad 0x020100030E0D0C0F, 0x0A09080B06050407
1007 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
1008 .quad 0x0A09080B06050407, 0x020100030E0D0C0F
## .Lk_sr (shiftrows permutations, 4 rows) — label line not visible here.
1011 .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
1012 .quad 0x030E09040F0A0500, 0x0B06010C07020D08
1013 .quad 0x0F060D040B020900, 0x070E050C030A0108
1014 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
## .Lk_rcon (round constants) — label line not visible here; loaded into
## %xmm8 by _vpaes_schedule_core.
1017 .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
1019 .Lk_s63: # s63: all equal to 0x63 transformed
1020 .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
1022 .Lk_opt: # output transform
1023 .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
1024 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
1026 .Lk_deskew: # deskew tables: inverts the sbox's "skew"
1027 .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
1028 .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
1032 ## Key schedule constants
1034 .Lk_dksd: # decryption key schedule: invskew x*D
1035 .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
1036 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
1037 .Lk_dksb: # decryption key schedule: invskew x*B
1038 .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
1039 .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
1040 .Lk_dkse: # decryption key schedule: invskew x*E + 0x63
1041 .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
1042 .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
1043 .Lk_dks9: # decryption key schedule: invskew x*9
1044 .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
1045 .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
1049 ## Round function constants
1051 .Lk_dipt: # decryption input transform
1052 .quad 0x0F505B040B545F00, 0x154A411E114E451A
1053 .quad 0x86E383E660056500, 0x12771772F491F194
1055 .Lk_dsb9: # decryption sbox output *9*u, *9*t
1056 .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
1057 .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
1058 .Lk_dsbd: # decryption sbox output *D*u, *D*t
1059 .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
1060 .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
1061 .Lk_dsbb: # decryption sbox output *B*u, *B*t
1062 .quad 0xD022649296B44200, 0x602646F6B0F2D404
1063 .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
1064 .Lk_dsbe: # decryption sbox output *E*u, *E*t
1065 .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
1066 .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
1067 .Lk_dsbo: # decryption sbox final output
1068 .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
1069 .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
1070 .asciz "Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
1072 .size _vpaes_consts,.-_vpaes_consts
## se_handler: Win64-only structured-exception handler. If the fault lies
## between a function's body and epilogue labels (HandlerData[0..1]), it
## restores the saved xmm6-xmm15 area and pops the frame so unwinding can
## continue via RtlVirtualUnwind.
1076 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1077 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1084 .extern __imp_RtlVirtualUnwind
1085 .type se_handler,\@abi-omnipotent
1099 mov 120($context),%rax # pull context->Rax
1100 mov 248($context),%rbx # pull context->Rip
1102 mov 8($disp),%rsi # disp->ImageBase
1103 mov 56($disp),%r11 # disp->HandlerData
## HandlerData[] holds image-relative prologue/epilogue labels (.rva
## entries emitted in the .LSEH_info_* records below in the file).
1105 mov 0(%r11),%r10d # HandlerData[0]
1106 lea (%rsi,%r10),%r10 # prologue label
1107 cmp %r10,%rbx # context->Rip<prologue label
1110 mov 152($context),%rax # pull context->Rsp
1112 mov 4(%r11),%r10d # HandlerData[1]
1113 lea (%rsi,%r10),%r10 # epilogue label
1114 cmp %r10,%rbx # context->Rip>=epilogue label
## In-body fault: copy the 10 saved xmm registers from the stack save
## area back into the CONTEXT record, then unwind the stack frame.
1117 lea 16(%rax),%rsi # %xmm save area
1118 lea 512($context),%rdi # &context.Xmm6
1119 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
1120 .long 0xa548f3fc # cld; rep movsq
1121 lea 0xb8(%rax),%rax # adjust stack pointer
1126 mov %rax,152($context) # restore context->Rsp
1127 mov %rsi,168($context) # restore context->Rsi
1128 mov %rdi,176($context) # restore context->Rdi
1130 mov 40($disp),%rdi # disp->ContextRecord
1131 mov $context,%rsi # context
1132 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1133 .long 0xa548f3fc # cld; rep movsq
## Hand the (possibly fixed-up) context to RtlVirtualUnwind; args 5-8 go
## on the stack per the Microsoft x64 calling convention.
1136 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1137 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1138 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1139 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1140 mov 40(%rsi),%r10 # disp->ContextRecord
1141 lea 56(%rsi),%r11 # &disp->HandlerData
1142 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1143 mov %r10,32(%rsp) # arg5
1144 mov %r11,40(%rsp) # arg6
1145 mov %r12,48(%rsp) # arg7
1146 mov %rcx,56(%rsp) # arg8, (NULL)
1147 call *__imp_RtlVirtualUnwind(%rip)
1149 mov \$1,%eax # ExceptionContinueSearch
1161 .size se_handler,.-se_handler
## Win64 SEH tables: one begin/end/info triple per public entry point
## (consumed by se_handler above via disp->HandlerData).
1165 .rva .LSEH_begin_${PREFIX}_set_encrypt_key
1166 .rva .LSEH_end_${PREFIX}_set_encrypt_key
1167 .rva .LSEH_info_${PREFIX}_set_encrypt_key
1169 .rva .LSEH_begin_${PREFIX}_set_decrypt_key
1170 .rva .LSEH_end_${PREFIX}_set_decrypt_key
1171 .rva .LSEH_info_${PREFIX}_set_decrypt_key
1173 .rva .LSEH_begin_${PREFIX}_encrypt
1174 .rva .LSEH_end_${PREFIX}_encrypt
1175 .rva .LSEH_info_${PREFIX}_encrypt
1177 .rva .LSEH_begin_${PREFIX}_decrypt
1178 .rva .LSEH_end_${PREFIX}_decrypt
1179 .rva .LSEH_info_${PREFIX}_decrypt
1181 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
1182 .rva .LSEH_end_${PREFIX}_cbc_encrypt
1183 .rva .LSEH_info_${PREFIX}_cbc_encrypt
## Per-function handler data: body/epilogue labels for se_handler's
## prologue/epilogue range check.
1187 .LSEH_info_${PREFIX}_set_encrypt_key:
1190 .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[]
1191 .LSEH_info_${PREFIX}_set_decrypt_key:
1194 .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[]
1195 .LSEH_info_${PREFIX}_encrypt:
1198 .rva .Lenc_body,.Lenc_epilogue # HandlerData[]
1199 .LSEH_info_${PREFIX}_decrypt:
1202 .rva .Ldec_body,.Ldec_epilogue # HandlerData[]
1203 .LSEH_info_${PREFIX}_cbc_encrypt:
1206 .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[]
# Evaluate `...` expressions embedded in the generated code (e.g. the
# \$`1232/8` above) before the text is emitted to the translator.
1210 $code =~ s/\`([^\`]*)\`/eval($1)/gem;