# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
######################################################################
## Constant-time SSSE3 AES core implementation.
## By Mike Hamburg (Stanford University), 2009
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
# CBC encrypt/decrypt performance in cycles per byte processed with
# 128-bit key.
#
#		aes-ppc.pl		this module
#
# G4e		35.5/52.1/(23.8)	11.9(*)/15.4
# POWER6	42.7/54.3/(28.2)	63.0/92.8(**)
# POWER7	32.3/42.9/(18.4)	18.5/23.3
#
# (*)	This is ~10% worse than reported in paper. The reason is
#	twofold. This module doesn't make any assumption about
#	key schedule (or data for that matter) alignment and handles
#	it in-line. Secondly it, being transliterated from
#	vpaes-x86_64.pl, relies on "nested inversion" better suited
#	for Intel CPUs.
#
# (**)	Inadequate POWER6 performance is due to astronomic AltiVec
#	latency, 9 cycles per simple logical operation.
if ($flavour =~ /64/) {
} elsif ($flavour =~ /32/) {
} else { die "nonsense $flavour"; }

$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
.align 7 # totally strategic alignment
Lk_mc_forward: # mc_forward
	.long 0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c ?inv
	.long 0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300 ?inv
	.long 0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704 ?inv
	.long 0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08 ?inv
Lk_mc_backward: # mc_backward
	.long 0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e ?inv
	.long 0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a ?inv
	.long 0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506 ?inv
	.long 0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102 ?inv
	.long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f ?inv
	.long 0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b ?inv
	.long 0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07 ?inv
	.long 0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603 ?inv
	.long 0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704 ?rev
	.long 0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03 ?rev
Lk_ipt: # input transform (lo, hi)
	.long 0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca ?rev
	.long 0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd ?rev
	.long 0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15 ?rev
	.long 0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e ?rev
	.long 0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b ?rev
	.long 0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5 ?rev
	.long 0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2 ?rev
	.long 0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e ?rev
Lk_dipt: # decryption input transform
	.long 0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15 ?rev
	.long 0x00650560, 0xe683e386, 0x94f191f4, 0x72177712 ?rev
Lk_dsbo: # decryption sbox final output
	.long 0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7 ?rev
	.long 0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca ?rev
Lk_dsb9: # decryption sbox output *9*u, *9*t
	.long 0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca ?rev
	.long 0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72 ?rev
Lk_dsbd: # decryption sbox output *D*u, *D*t
	.long 0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5 ?rev
	.long 0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129 ?rev
Lk_dsbb: # decryption sbox output *B*u, *B*t
	.long 0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660 ?rev
	.long 0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3 ?rev
Lk_dsbe: # decryption sbox output *E*u, *E*t
	.long 0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222 ?rev
	.long 0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794 ?rev
## Key schedule constants
Lk_dksd: # decryption key schedule: invskew x*D
	.long 0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007 ?rev
	.long 0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f ?rev
Lk_dksb: # decryption key schedule: invskew x*B
	.long 0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603 ?rev
	.long 0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9 ?rev
Lk_dkse: # decryption key schedule: invskew x*E + 0x63
	.long 0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553 ?rev
	.long 0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd ?rev
Lk_dks9: # decryption key schedule: invskew x*9
	.long 0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a ?rev
	.long 0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b ?rev
	.long 0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70 ?asis
	.long 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b ?asis
Lk_opt: # output transform
	.long 0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7 ?rev
	.long 0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1 ?rev
Lk_deskew: # deskew tables: inverts the sbox's "skew"
	.long 0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d ?rev
	.long 0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128 ?rev
	mflr r12 #vvvvv "distance between . and _vpaes_consts
	.byte 0,12,0x14,0,0,0,0,0
	.asciz "Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"
my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));
my ($inp,$out,$key) = map("r$_",(3..5));
my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));
## Fills register %r10 -> .aes_consts (so you can -fPIC)
## and %xmm9-%xmm15 as specified below.
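##
## (Added note, not part of the original comments: the preheat routines
## locate the constant pool relative to r12 and cache the tables in
## AltiVec registers per the aliases declared above -- v7/v8/v9 hold the
## 0x00/0x04/0x0f splat constants, $invlo/$invhi (v10/v11) hold Lk_inv,
## $iptlo/$ipthi (v12/v13) hold Lk_ipt, and v14 upward hold the S-box
## output tables used by the rounds.)
##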
_vpaes_encrypt_preheat:
	li r11, 0xc0 # Lk_inv
	vxor v7, v7, v7 # 0x00..00
	vspltisb v8,4 # 0x04..04
	vspltisb v9,0x0f # 0x0f..0f
	.byte 0,12,0x14,0,0,0,0,0
## AES-encrypt %xmm0.
## %xmm9-%xmm15 as in _vpaes_preheat
## (%rdx) = scheduled keys
## Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax
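##
## (Added reader's sketch, not from the original notes: each "middle
## round" below evaluates the S-box via two table lookups on the
## GF(2^4)-inversion outputs i/j and folds MixColumns into byte
## rotations, roughly
##	A   = sb1u(i) ^ sb1t(j) ^ round_key	# SubBytes + AddRoundKey
##	A2  = sb2u(i) ^ sb2t(j)			# SubBytes scaled by x
##	B   = rot(A, mc_forward); D = rot(A, mc_backward)
##	out = D ^ (A2 ^ B) ^ rot(A2 ^ B, mc_forward)	# "2A+3B+C+D" in the comments
## where rot() is a vperm by a row of Lk_mc_forward/Lk_mc_backward.)
##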
	lwz r8, 240($key) # pull rounds
	lvx v5, 0, $key # vmovdqu (%r9), %xmm5 # round0 key
	?vperm v5, v5, v6, $keyperm # align round key
	vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
	vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm1
	vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm3, %xmm2
	vxor v0, v0, v5 # vpxor %xmm5, %xmm1, %xmm0
	vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
	# middle of middle round
	vperm v4, $sb1t, v7, v2 # vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
	lvx v1, r12, r11 # vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
	vperm v0, $sb1u, v7, v3 # vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
	vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	andi. r11, r11, 0x30 # and \$0x30, %r11 # ... mod 4
	vperm v5, $sb2t, v7, v2 # vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
	vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	vperm v2, $sb2u, v7, v3 # vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
	lvx v4, r12, r10 # vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
	vperm v3, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
	vxor v2, v2, v5 # vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
	vperm v0, v0, v7, v4 # vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
	vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
	vperm v4, v3, v7, v1 # vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
	vxor v0, v0, v3 # vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
	vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
	vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	vperm v5, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
	vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	vxor v3, v3, v5 # vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	lvx v6, r9, $key # vmovdqu (%r9), %xmm5
	vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	?vperm v5, v5, v6, $keyperm # align round key
	vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	# middle of last round
	# vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
	# vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
	vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
	lvx v1, r12, r10 # vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
	vperm v0, $sbot, v7, v3 # vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
	vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	vperm v0, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm0
	.byte 0,12,0x14,0,0,0,0,0
.globl .vpaes_encrypt
	$STU $sp,-$FRAME($sp)
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mfspr r7, 256 # save vrsave
	stw r7,`$FRAME-4`($sp) # save vrsave
	$PUSH r6,`$FRAME+$LRSAVE`($sp)
	mtspr 256, r0 # preserve all AltiVec registers
	bl _vpaes_encrypt_preheat
	?lvsl $inpperm, 0, $inp # prepare for unaligned access
	addi $inp, $inp, 15 # 15 is not a typo
	?lvsr $outperm, 0, $out
	?lvsl $keyperm, 0, $key # prepare for unaligned access
	lvx $inptail, 0, $inp # redundant in aligned case
	?vperm v0, v0, $inptail, $inpperm
	bl _vpaes_encrypt_core
	vperm v0, v0, v0, $outperm # rotate right/left
	bdnz Lenc_out_unaligned
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mtspr 256, r7 # restore vrsave
	.byte 0,12,0x04,1,0x80,0,3,0
.size .vpaes_encrypt,.-.vpaes_encrypt
_vpaes_decrypt_preheat:
	li r11, 0xc0 # Lk_inv
	vxor v7, v7, v7 # 0x00..00
	vspltisb v8,4 # 0x04..04
	vspltisb v9,0x0f # 0x0f..0f
	.byte 0,12,0x14,0,0,0,0,0
## Same API as encryption core.
	lwz r8, 240($key) # pull rounds
	lvx v5, 0, $key # vmovdqu (%r9), %xmm4 # round0 key
	?vperm v5, v5, v6, $keyperm # align round key
	vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
	vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
	vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm1, %xmm0
	vxor v0, v0, v5 # vpxor %xmm4, %xmm2, %xmm2
	vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
	# Inverse mix columns
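	#
	# (Added note: each decryption round below folds InvSubBytes and
	# InvMixColumns into four lookup/accumulate stages, roughly
	#	ch  = rk     ^ dsb9u(io) ^ dsb9t(jo)
	#	ch  = MC(ch) ^ dsbdu(io) ^ dsbdt(jo)
	#	ch  = MC(ch) ^ dsbbu(io) ^ dsbbt(jo)
	#	out = MC(ch) ^ dsbeu(io) ^ dsbet(jo)
	# where MC() is the byte rotation loaded through r11 and io/jo are
	# the nibble-inversion outputs computed at the loop entry.)
	#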
	lvx v0, r12, r11 # v5 and v0 are flipped
	# vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
	# vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
	vperm v4, $sb9u, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
	vperm v1, $sb9t, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
	vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0
	# vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
	vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	# vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
	vperm v4, $sbdu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
	vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	vperm v1, $sbdt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
	vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	# vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
	vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	# vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
	vperm v4, $sbbu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
	vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	vperm v1, $sbbt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
	vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	# vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
	vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	# vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
	vperm v4, $sbeu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
	vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	vperm v1, $sbet, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
	vxor v0, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	vxor v0, v0, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	vperm v2, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
	vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	lvx v6, r9, $key # vmovdqu (%r9), %xmm0
	vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	?vperm v5, v5, v6, $keyperm # align round key
	vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	# middle of last round
	# vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
	vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
	# vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
	lvx v2, r12, r10 # vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
	vperm v1, $sbot, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
	vxor v4, v4, v5 # vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
	vxor v0, v1, v4 # vpxor %xmm4, %xmm1, %xmm0 # 0 = A
	vperm v0, v0, v7, v2 # vpshufb %xmm2, %xmm0, %xmm0
	.byte 0,12,0x14,0,0,0,0,0
.globl .vpaes_decrypt
	$STU $sp,-$FRAME($sp)
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mfspr r7, 256 # save vrsave
	stw r7,`$FRAME-4`($sp) # save vrsave
	$PUSH r6,`$FRAME+$LRSAVE`($sp)
	mtspr 256, r0 # preserve all AltiVec registers
	bl _vpaes_decrypt_preheat
	?lvsl $inpperm, 0, $inp # prepare for unaligned access
	addi $inp, $inp, 15 # 15 is not a typo
	?lvsr $outperm, 0, $out
	?lvsl $keyperm, 0, $key
	lvx $inptail, 0, $inp # redundant in aligned case
	?vperm v0, v0, $inptail, $inpperm
	bl _vpaes_decrypt_core
	vperm v0, v0, v0, $outperm # rotate right/left
	bdnz Ldec_out_unaligned
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mtspr 256, r7 # restore vrsave
	.byte 0,12,0x04,1,0x80,0,3,0
.size .vpaes_decrypt,.-.vpaes_decrypt
.globl .vpaes_cbc_encrypt
	$STU $sp,-`($FRAME+2*$SIZE_T)`($sp)
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	stw r12,`$FRAME-4`($sp) # save vrsave
	$PUSH r30,`$FRAME+$SIZE_T*0`($sp)
	$PUSH r31,`$FRAME+$SIZE_T*1`($sp)
	$PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
	and r30, r5, r9 # copy length&-16
	andi. r9, $out, 15 # is $out aligned?
	mr r5, r6 # copy pointer to key
	mr r31, r7 # copy pointer to iv
	mcrf cr1, cr0 # put aside $out alignment flag
	mr r7, r12 # copy vrsave
	mtspr 256, r6 # preserve all AltiVec registers
	lvx v24, 0, r31 # load [potentially unaligned] iv
	?lvsl $inpperm, 0, r31
	?vperm v24, v24, v25, $inpperm
	cmpwi r8, 0 # test direction
	neg r8, $inp # prepare for unaligned access
	?lvsl $keyperm, 0, $key
	?lvsr $outperm, 0, $out
	?lvsr $inpperm, 0, r8 # -$inp
	vnor $outmask, v7, v7 # 0xff..ff
	lvx $inptail, 0, $inp
	?vperm $outmask, v7, $outmask, $outperm
	addi $inp, $inp, 15 # 15 is not a typo
	bl _vpaes_encrypt_preheat
	beq cr1, Lcbc_enc_loop # $out is aligned
	lvx $inptail, 0, $inp
	?vperm v0, v0, $inptail, $inpperm
	vxor v0, v0, v24 # ^= iv
	bl _vpaes_encrypt_core
	vmr v24, v0 # put aside iv
	vperm $outhead, v0, v0, $outperm # rotate right/left
	stvebx $outhead, r8, r9
	sub. r30, r30, r0 # len -= 16
	beq Lcbc_unaligned_done
	lvx $inptail, 0, $inp
	?vperm v0, v0, $inptail, $inpperm
	vxor v0, v0, v24 # ^= iv
	bl _vpaes_encrypt_core
	vmr v24, v0 # put aside iv
	sub. r30, r30, r0 # len -= 16
	vperm v0, v0, v0, $outperm # rotate right/left
	vsel v1, $outhead, v0, $outmask
	bl _vpaes_decrypt_preheat
	beq cr1, Lcbc_dec_loop # $out is aligned
	lvx $inptail, 0, $inp
	?vperm v0, v0, $inptail, $inpperm
	vmr v25, v0 # put aside input
	bl _vpaes_decrypt_core
	vxor v0, v0, v24 # ^= iv
	vperm $outhead, v0, v0, $outperm # rotate right/left
	stvebx $outhead, r8, r9
	sub. r30, r30, r0 # len -= 16
	beq Lcbc_unaligned_done
	lvx $inptail, 0, $inp
	?vperm v0, v0, $inptail, $inpperm
	vmr v25, v0 # put aside input
	bl _vpaes_decrypt_core
	vxor v0, v0, v24 # ^= iv
	sub. r30, r30, r0 # len -= 16
	vperm v0, v0, v0, $outperm # rotate right/left
	vsel v1, $outhead, v0, $outmask
	beq cr1, Lcbc_write_iv # $out is aligned
	stvebx $outhead, r9, $out
	neg r8, r31 # write [potentially unaligned] iv
	?lvsl $outperm, 0, r8
	vperm v24, v24, v24, $outperm # rotate right/left
	stvewx v24, 0, r31 # ivp is at least 32-bit aligned
	mtspr 256, r7 # restore vrsave
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	$POP r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
	$POP r30,`$FRAME+$SIZE_T*0`($sp)
	$POP r31,`$FRAME+$SIZE_T*1`($sp)
	addi $sp,$sp,`$FRAME+$SIZE_T*2`
	.byte 0,12,0x04,1,0x80,2,6,0
.size .vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
my ($inp,$bits,$out)=map("r$_",(3..5));
my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));
########################################################
##                  AES key schedule                  ##
########################################################
	li r11, 0xc0 # Lk_inv
	vspltisb v8,4 # 0x04..04
	vxor v9,v9,v9 # 0x00..00
	lvx $invlo, r12, r11 # Lk_inv
	lvx $iptlo, r12, r9 # Lk_ipt
	lvx v14, r12, r11 # Lk_sb1
	lvx v16, r12, r9 # Lk_dksd
	lvx v18, r12, r11 # Lk_dksb
	lvx v20, r12, r9 # Lk_dkse
	lvx v22, r12, r11 # Lk_dks9
	lvx v24, r12, r9 # Lk_rcon
	lvx v25, 0, r12 # Lk_mc_forward[0]
	lvx v26, r12, r8 # Lk_s63
	.byte 0,12,0x14,0,0,0,0,0
_vpaes_schedule_core:
	bl _vpaes_key_preheat # load the tables
	#lvx v0, 0, $inp # vmovdqu (%rdi), %xmm0 # load key (unaligned)
	neg r8, $inp # prepare for unaligned access
	addi $inp, $inp, 15 # 15 is not a typo
	?lvsr $inpperm, 0, r8 # -$inp
	lvx v6, 0, $inp # v6 serves as inptail
	?vperm v0, v0, v6, $inpperm
	vmr v3, v0 # vmovdqa %xmm0, %xmm3
	bl _vpaes_schedule_transform
	vmr v7, v0 # vmovdqa %xmm0, %xmm7
	bne $dir, Lschedule_am_decrypting
	# encrypting, output zeroth round key after transform
	li r8, 0x30 # mov \$0x30,%r8d
	?lvsr $outperm, 0, $out # prepare for unaligned access
	vnor $outmask, v9, v9 # 0xff..ff
	?vperm $outmask, v9, $outmask, $outperm
	#stvx v0, 0, $out # vmovdqu %xmm0, (%rdx)
	vperm $outhead, v0, v0, $outperm # rotate right/left
	stvewx $outhead, 0, $out # some are superfluous
	stvewx $outhead, r9, $out
	stvewx $outhead, r10, $out
	addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
	stvewx $outhead, r11, $out
Lschedule_am_decrypting:
	srwi r8, $bits, 1 # shr \$1,%r8d
	andi. r8, r8, 32 # and \$32,%r8d
	xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
	addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
	# decrypting, output zeroth round key after shiftrows
	lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
	vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
	neg r0, $out # prepare for unaligned access
	?lvsl $outperm, 0, r0
	vnor $outmask, v9, v9 # 0xff..ff
	?vperm $outmask, $outmask, v9, $outperm
	#stvx v4, 0, $out # vmovdqu %xmm3, (%rdx)
	vperm $outhead, v4, v4, $outperm # rotate right/left
	stvewx $outhead, 0, $out # some are superfluous
	stvewx $outhead, r9, $out
	stvewx $outhead, r10, $out
	addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
	stvewx $outhead, r11, $out
	addi $out, $out, 15 # 15 is not a typo
	xori r8, r8, 0x30 # xor \$0x30, %r8
	cmplwi $bits, 192 # cmp \$192, %esi
## 128-bit specific part of key schedule.
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
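##
## (Added sketch: with the loop counter set to 10 just below, the flow
## is essentially
##	for (i = 1; i <= 10; i++) {
##		round();		# expand the next round key
##		if (i == 10) break;	# final key handled by mangle_last
##		mangle();		# write round key i
##	}
## )
##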
	li r0, 10 # mov \$10, %esi
	bl _vpaes_schedule_round
	bdz Lschedule_mangle_last # dec %esi
	bl _vpaes_schedule_mangle # write output
## .aes_schedule_192
## 192-bit specific part of key schedule.
## The main body of this schedule is the same as the 128-bit
## schedule, but with more smearing. The long, high side is
## stored in %xmm7 as before, and the short, low side is in
## the high bits of %xmm6.
## This schedule is somewhat nastier, however, because each
## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round
## keys.
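##
## (Added sketch of Loop_schedule_192 below, run 4 times:)
##	round(); merge saved low half; mangle();	# write key n
##	smear(); mangle();				# write key n+1
##	round();					# last pass exits to mangle_last here
##	mangle();					# write key n+2
##	smear();
##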
	li r0, 4 # mov \$4, %esi
	?vperm v0, v6, v0, $inpperm
	?vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
	bl _vpaes_schedule_transform # input transform
	?vsldoi v6, v0, v9, 8
	?vsldoi v6, v9, v6, 8 # clobber "low" side with zeros
	bl _vpaes_schedule_round
	?vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0
	bl _vpaes_schedule_mangle # save key n
	bl _vpaes_schedule_192_smear
	bl _vpaes_schedule_mangle # save key n+1
	bl _vpaes_schedule_round
	bdz Lschedule_mangle_last # dec %esi
	bl _vpaes_schedule_mangle # save key n+2
	bl _vpaes_schedule_192_smear
## .aes_schedule_256
## 256-bit specific part of key schedule.
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
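##
## (Added sketch of Loop_schedule_256 below, run 7 times:)
##	mangle();				# write previous low-side result
##	save low side; round();			# high round; last pass exits here
##	mangle();				# write high-side result
##	broadcast last word; low_round();	# expand the low side (no rcon/rotation)
##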
	li r0, 7 # mov \$7, %esi
	lvx v0, 0, $inp # vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
	?vperm v0, v6, v0, $inpperm
	bl _vpaes_schedule_transform # input transform
	bl _vpaes_schedule_mangle # output low result
	vmr v6, v0 # vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
	bl _vpaes_schedule_round
	bdz Lschedule_mangle_last # dec %esi
	bl _vpaes_schedule_mangle
	# low round. swap xmm7 and xmm6
	?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
	vmr v5, v7 # vmovdqa %xmm7, %xmm5
	vmr v7, v6 # vmovdqa %xmm6, %xmm7
	bl _vpaes_schedule_low_round
	vmr v7, v5 # vmovdqa %xmm5, %xmm7
## .aes_schedule_mangle_last
## Mangler for last round of key schedule
## when encrypting, outputs out(%xmm0) ^ 63
## when decrypting, outputs unskew(%xmm0)
## Always called right before return... jumps to cleanup and exits
Lschedule_mangle_last:
	# schedule last round key from xmm0
	li r11, 0x2e0 # lea .Lk_deskew(%rip),%r11
	bne $dir, Lschedule_mangle_last_dec
	lvx v1, r8, r10 # vmovdqa (%r8,%r10),%xmm1
	li r11, 0x2c0 # lea .Lk_opt(%rip), %r11 # prepare to output transform
	li r9, 0x2d0 # prepare to output transform
	vperm v0, v0, v0, v1 # vpshufb %xmm1, %xmm0, %xmm0 # output permute
	lvx $iptlo, r11, r12 # reload $ipt
	addi $out, $out, 16 # add \$16, %rdx
	vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
	bl _vpaes_schedule_transform # output transform
	#stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
	vperm v0, v0, v0, $outperm # rotate right/left
	vsel v2, $outhead, v0, $outmask
	stvewx v0, 0, $out # some (or all) are redundant
	stvewx v0, r10, $out
	stvewx v0, r11, $out
	stvewx v0, r12, $out
	b Lschedule_mangle_done
Lschedule_mangle_last_dec:
	lvx $iptlo, r11, r12 # reload $ipt
	addi $out, $out, -16 # add \$-16, %rdx
	vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
	bl _vpaes_schedule_transform # output transform
	#stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
	addi r9, $out, -15 # -15 is not a typo
	vperm v0, v0, v0, $outperm # rotate right/left
	vsel v2, $outhead, v0, $outmask
	stvewx v0, 0, r9 # some (or all) are redundant
Lschedule_mangle_done:
	vxor v0, v0, v0 # vpxor %xmm0, %xmm0, %xmm0
	vxor v1, v1, v1 # vpxor %xmm1, %xmm1, %xmm1
	vxor v2, v2, v2 # vpxor %xmm2, %xmm2, %xmm2
	vxor v3, v3, v3 # vpxor %xmm3, %xmm3, %xmm3
	vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
	vxor v5, v5, v5 # vpxor %xmm5, %xmm5, %xmm5
	vxor v6, v6, v6 # vpxor %xmm6, %xmm6, %xmm6
	vxor v7, v7, v7 # vpxor %xmm7, %xmm7, %xmm7
	.byte 0,12,0x14,0,0,0,0,0
## .aes_schedule_192_smear
## Smear the short, low side in the 192-bit key schedule.
##
## Inputs:
##	%xmm7: high side, b a x y
##	%xmm6: low side, d c 0 0
##
## Outputs:
##	%xmm6: b+c+d b+c 0 0
##	%xmm0: b+c+d b+c b a
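##
## (Added dword-wise walk-through of the smear:)
##	xmm6 = (d c 0 0) ^ (c 0 0 0)			-> (c+d  c  0  0)
##	xmm6 ^= (b b b a)				-> (b+c+d  b+c  b  a)
##	xmm0 takes that value; xmm6's low half is then cleared	-> (b+c+d  b+c  0  0)
##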
_vpaes_schedule_192_smear:
	?vsldoi v1, v9, v6, 12 # vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
	?vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
	vxor v6, v6, v1 # vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
	vxor v6, v6, v0 # vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
	?vsldoi v6, v6, v9, 8
	?vsldoi v6, v9, v6, 8 # clobber low side with zeros
	.byte 0,12,0x14,0,0,0,0,0
## .aes_schedule_round
## Runs one main round of the key schedule on %xmm0, %xmm7
## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of
## %xmm7.
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## the next rcon.
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm4, %r11.
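##
## (Added note: in conventional AES key-expansion terms the high round
## below is roughly
##	xmm7 ^= rcon				# rcon pulled from the low byte of %xmm8
##	t     = SubWord(RotWord(high word of %xmm0))
##	xmm7 ^= (xmm7 << 32); xmm7 ^= (xmm7 << 64)	# smear across the dwords
##	xmm0  = xmm7 ^= t			# new round key
## while the low round, used for the second half of 256-bit keys, skips
## the rcon and the rotation.)
##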
_vpaes_schedule_round:
	# extract rcon from xmm8
	#vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
	?vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1
	?vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8
	vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
	?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
	?vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0
	# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	?vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1
	vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
	vspltisb v1, 0x0f # 0x0f..0f
	?vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4
	vand v1, v1, v0 # vpand %xmm9, %xmm0, %xmm1 # 0 = k
	vsrb v0, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	vxor v7, v7, v4 # vpxor %xmm4, %xmm7, %xmm7
	vperm v2, $invhi, v9, v1 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
	vxor v1, v1, v0 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	vperm v3, $invlo, v9, v0 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	vperm v4, $invlo, v9, v1 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	vxor v7, v7, v26 # vpxor .Lk_s63(%rip), %xmm7, %xmm7
	vperm v3, $invlo, v9, v3 # vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
	vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	vperm v2, $invlo, v9, v4 # vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
	vxor v3, v3, v1 # vpxor %xmm1, %xmm3, %xmm3 # 2 = io
	vxor v2, v2, v0 # vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
	vperm v4, v15, v9, v3 # vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
	vperm v1, v14, v9, v2 # vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
	vxor v1, v1, v4 # vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
	# add in smeared stuff
	vxor v0, v1, v7 # vpxor %xmm7, %xmm1, %xmm0
	vxor v7, v1, v7 # vmovdqa %xmm0, %xmm7
	.byte 0,12,0x14,0,0,0,0,0
## .aes_schedule_transform
## Linear-transform %xmm0 according to tables at (%r11)
## Requires that %xmm9 = 0x0F0F... as in preheat
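##
## (Added note: per byte this is the usual split-nibble lookup,
##	out = lo_table[ in & 0x0f ] ^ hi_table[ in >> 4 ]
## with lo_table/hi_table being the pair of 16-entry tables at (%r11).)
##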
_vpaes_schedule_transform:
	#vand v1, v0, v9 # vpand %xmm9, %xmm0, %xmm1
	vsrb v2, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
	# vmovdqa (%r11), %xmm2 # lo
	vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
	# vmovdqa 16(%r11), %xmm1 # hi
	vperm v2, $ipthi, $ipthi, v2 # vpshufb %xmm0, %xmm1, %xmm0
	vxor v0, v0, v2 # vpxor %xmm2, %xmm0, %xmm0
	.byte 0,12,0x14,0,0,0,0,0
## .aes_schedule_mangle
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
##	multiply by circulant 0,1,1,1
##	apply shiftrows transform
##
## On decrypt,
##	multiply by "inverse mixcolumns" circulant E,B,D,9
##	apply shiftrows transform
##
## Writes out to (%rdx), and increments or decrements it
## Keeps track of round number mod 4 in %r8
## Clobbers xmm1-xmm5
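##
## (Added sketch of the encrypt path below; the decrypt path does the
## analogous thing through the Lk_dks* tables:)
##	t   = xmm0 ^ Lk_s63
##	t   = rot(t) ^ rot^2(t) ^ rot^3(t)	# multiply by circulant 0,1,1,1
##	out = shiftrows(t, Lk_sr[r8]);  r8 = (r8 - 16) & 0x30
##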
_vpaes_schedule_mangle:
	#vmr v4, v0 # vmovdqa %xmm0, %xmm4 # save xmm0 for later
	# vmovdqa .Lk_mc_forward(%rip),%xmm5
	bne $dir, Lschedule_mangle_dec
	vxor v4, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm4
	addi $out, $out, 16 # add \$16, %rdx
	vperm v4, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm4
	vperm v1, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm1
	vperm v3, v1, v1, v25 # vpshufb %xmm5, %xmm1, %xmm3
	vxor v4, v4, v1 # vpxor %xmm1, %xmm4, %xmm4
	lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
	vxor v3, v3, v4 # vpxor %xmm4, %xmm3, %xmm3
	vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
	addi r8, r8, -16 # add \$-16, %r8
	andi. r8, r8, 0x30 # and \$0x30, %r8
	#stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
	vperm v1, v3, v3, $outperm # rotate right/left
	vsel v2, $outhead, v1, $outmask
Lschedule_mangle_dec:
	# inverse mix columns
	# lea .Lk_dksd(%rip),%r11
	vsrb v1, v0, v8 # vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
	#and v4, v0, v9 # vpand %xmm9, %xmm4, %xmm4 # 4 = lo
	# vmovdqa 0x00(%r11), %xmm2
	vperm v2, v16, v16, v0 # vpshufb %xmm4, %xmm2, %xmm2
	# vmovdqa 0x10(%r11), %xmm3
	vperm v3, v17, v17, v1 # vpshufb %xmm1, %xmm3, %xmm3
	vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
	vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
	# vmovdqa 0x20(%r11), %xmm2
	vperm v2, v18, v18, v0 # vpshufb %xmm4, %xmm2, %xmm2
	vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
	# vmovdqa 0x30(%r11), %xmm3
	vperm v3, v19, v19, v1 # vpshufb %xmm1, %xmm3, %xmm3
	vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
	vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
	# vmovdqa 0x40(%r11), %xmm2
	vperm v2, v20, v20, v0 # vpshufb %xmm4, %xmm2, %xmm2
	vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
	# vmovdqa 0x50(%r11), %xmm3
	vperm v3, v21, v21, v1 # vpshufb %xmm1, %xmm3, %xmm3
	vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
	# vmovdqa 0x60(%r11), %xmm2
	vperm v2, v22, v22, v0 # vpshufb %xmm4, %xmm2, %xmm2
	vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
	# vmovdqa 0x70(%r11), %xmm4
	vperm v4, v23, v23, v1 # vpshufb %xmm1, %xmm4, %xmm4
	lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
	vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
	vxor v3, v4, v2 # vpxor %xmm2, %xmm4, %xmm3
	addi $out, $out, -16 # add \$-16, %rdx
	vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
	addi r8, r8, -16 # add \$-16, %r8
	andi. r8, r8, 0x30 # and \$0x30, %r8
	#stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
	vperm v1, v3, v3, $outperm # rotate right/left
	vsel v2, $outhead, v1, $outmask
	.byte 0,12,0x14,0,0,0,0,0
.globl .vpaes_set_encrypt_key
.vpaes_set_encrypt_key:
	$STU $sp,-$FRAME($sp)
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mfspr r6, 256 # save vrsave
	stw r6,`$FRAME-4`($sp) # save vrsave
	$PUSH r0, `$FRAME+$LRSAVE`($sp)
	mtspr 256, r7 # preserve all AltiVec registers
	srwi r9, $bits, 5 # shr \$5,%eax
	addi r9, r9, 6 # add \$5,%eax
	stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
	cmplw $dir, $bits, $bits # set encrypt direction
	li r8, 0x30 # mov \$0x30,%r8d
	bl _vpaes_schedule_core
	$POP r0, `$FRAME+$LRSAVE`($sp)
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mtspr 256, r6 # restore vrsave
	.byte 0,12,0x04,1,0x80,0,3,0
.size .vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key
.globl .vpaes_set_decrypt_key
.vpaes_set_decrypt_key:
	$STU $sp,-$FRAME($sp)
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mfspr r6, 256 # save vrsave
	stw r6,`$FRAME-4`($sp) # save vrsave
	$PUSH r0, `$FRAME+$LRSAVE`($sp)
	mtspr 256, r7 # preserve all AltiVec registers
	srwi r9, $bits, 5 # shr \$5,%eax
	addi r9, r9, 6 # add \$5,%eax
	stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
	slwi r9, r9, 4 # shl \$4,%eax
	add $out, $out, r9 # lea (%rdx,%rax),%rdx
	cmplwi $dir, $bits, 0 # set decrypt direction
	srwi r8, $bits, 1 # shr \$1,%r8d
	andi. r8, r8, 32 # and \$32,%r8d
	xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
	bl _vpaes_schedule_core
	$POP r0, `$FRAME+$LRSAVE`($sp)
	li r10,`15+6*$SIZE_T`
	li r11,`31+6*$SIZE_T`
	mtspr 256, r6 # restore vrsave
	.byte 0,12,0x04,1,0x80,0,3,0
.size .vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	# constants table endian-specific conversion
	if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
		# convert to endian-agnostic format
		foreach (split(/,\s+/,$1)) {
			my $l = /^0/?oct:int;
			push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
		}

		# little-endian conversion
		if ($flavour =~ /le$/o) {
			SWITCH: for($conv) {
				/\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
				/\?rev/ && do { @bytes=reverse(@bytes); last; };
			}
		}

		print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
	}
	$consts=0 if (m/Lconsts:/o); # end of table

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
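	# (For example, on little-endian the substitutions below swap the
	# two source registers of a '?vperm' and rewrite a '?vsldoi ..., N'
	# shift amount as 16-N with its inputs swapped, so the same logical
	# permutation is produced regardless of byte order.)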
	if ($flavour =~ /le$/o) { # little-endian
		s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
		s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
		s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
	} else { # big-endian