######################################################################
## Constant-time SSSE3 AES core implementation.
## By Mike Hamburg (Stanford University), 2009
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
# CBC encrypt/decrypt performance in cycles per byte processed with
# 128-bit key.
#
#		aes-ppc.pl		this
# G4e		35.5/52.1/(23.8)	11.9(*)/15.4
# POWER6	42.7/54.3/(28.2)	63.0/92.8(**)
# POWER7	32.3/42.9/(18.4)	18.5/23.3
# (*)	This is ~10% worse than reported in the paper. The reason is
#	twofold. First, this module makes no assumptions about key
#	schedule (or data, for that matter) alignment and handles it
#	in-line. Second, being transliterated from vpaes-x86_64.pl, it
#	relies on "nested inversion," which is better suited to Intel
#	CPUs.
# (**)	Inadequate POWER6 performance is due to astronomical AltiVec
#	latency, 9 cycles per simple logical operation.
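#
# As a rough worked example of what these numbers mean (illustrative
# arithmetic only, not a measured figure): at 18.5 cycles per byte, a
# 3 GHz POWER7 core would sustain about 3e9/18.5 = ~162 MB/s of CBC
# encryption.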
if ($flavour =~ /64/) {
} elsif ($flavour =~ /32/) {
} else { die "nonsense $flavour"; }

$FRAME=6*$SIZE_T+13*16;	# 13*16 covers the v20-v31 offload (12 regs)
			# plus the vrsave slot and alignment slack

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
.align	7	# totally strategic alignment
_vpaes_consts:
Lk_mc_forward:	# mc_forward
	.long	0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c
	.long	0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300
	.long	0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704
	.long	0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08
Lk_mc_backward:	# mc_backward
	.long	0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e
	.long	0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a
	.long	0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506
	.long	0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102
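##
## (A quick way to sanity-check the two tables above, reading bytes in
## big-endian order: Lk_mc_forward row r, byte j is
## ((j & 0x0c) + 4*r) % 16 + ((j + 1) & 3), i.e. each 4-byte lane is
## rotated left by one byte with the lane offset advancing per row;
## Lk_mc_backward is the mirror image,
## ((j & 0x0c) - 4*r) % 16 + ((j - 1) & 3).)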
Lk_sr:		# sr
	.long	0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
	.long	0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b
	.long	0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07
	.long	0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603
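##
## (Likewise Lk_sr row k, byte j is (1 + 4*k)*j % 16, i.e. ShiftRows
## fused with the schedule's quarter-turn. A one-line Perl sketch, not
## part of the build, reproduces the four rows above:
##   for my \$k (0..3) { printf("%02x",(1+4*\$k)*\$_%16) for 0..15; print chr(10) }
## )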
Lk_inv:		# inv, inva
	.long	0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704
	.long	0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03
Lk_ipt:		# input transform (lo, hi)
	.long	0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca
	.long	0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd
Lk_sb1:		# sb1u, sb1t
	.long	0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15
	.long	0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e
Lk_sb2:		# sb2u, sb2t
	.long	0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b
	.long	0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5
Lk_sbo:		# sbou, sbot
	.long	0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2
	.long	0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e
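##
## (Layout note: each of the tables above is a (lo, hi) pair of
## 16-entry byte tables consumed by vperm as 4-bit lookups. For an
## input byte x the core computes lo[x & 0x0f] ^ hi[x >> 4]; every
## 8x8-bit linear map, and via Lk_inv the GF(2^8) inversion, is
## realized this way without secret-indexed loads.)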
Lk_dipt:	# decryption input transform
	.long	0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15
	.long	0x00650560, 0xe683e386, 0x94f191f4, 0x72177712
Lk_dsbo:	# decryption sbox final output
	.long	0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7
	.long	0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca
Lk_dsb9:	# decryption sbox output *9*u, *9*t
	.long	0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca
	.long	0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72
Lk_dsbd:	# decryption sbox output *D*u, *D*t
	.long	0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5
	.long	0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129
Lk_dsbb:	# decryption sbox output *B*u, *B*t
	.long	0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660
	.long	0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3
Lk_dsbe:	# decryption sbox output *E*u, *E*t
	.long	0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222
	.long	0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794
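##
## (The *9/*D/*B/*E pairs fold the inverse S-box together with
## multiplication by 0x09, 0x0d, 0x0b and 0x0e in GF(2^8), the
## coefficients of the InvMixColumns circulant, so decryption applies
## InvSubBytes and InvMixColumns in a single pass of table lookups.)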
## Key schedule constants
Lk_dksd:	# decryption key schedule: invskew x*D
	.long	0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007
	.long	0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f
Lk_dksb:	# decryption key schedule: invskew x*B
	.long	0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603
	.long	0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9
Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
	.long	0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553
	.long	0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd
Lk_dks9:	# decryption key schedule: invskew x*9
	.long	0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a
	.long	0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b
Lk_rcon:	# rcon
	.long	0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70
Lk_s63:
	.long	0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b
Lk_opt:		# output transform
	.long	0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7
	.long	0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1
Lk_deskew:	# deskew tables: inverts the sbox's "skew"
	.long	0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d
	.long	0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128
	mflr	r12	# distance between . and _vpaes_consts
	.byte	0,12,0x14,0,0,0,0,0
.asciz	"Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"
my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));

my ($inp,$out,$key) = map("r$_",(3..5));

my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));
## Fills register %r10 -> .aes_consts (so you can -fPIC)
## and %xmm9-%xmm15 as specified below.
_vpaes_encrypt_preheat:
	li	r11, 0xc0		# Lk_inv
	vxor	v7, v7, v7		# 0x00..00
	vspltisb	v8, 4		# 0x04..04
	vspltisb	v9, 0x0f	# 0x0f..0f
	.byte	0,12,0x14,0,0,0,0,0

## AES-encrypt %xmm0.
##	%xmm9-%xmm15 as in _vpaes_preheat
##	(%rdx) = scheduled keys
## Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax
	lwz	r8, 240($key)		# pull rounds
	lvx	v5, 0, $key		# vmovdqu (%r9), %xmm5	# round0 key
	vperm	v5, v5, v6, $keyperm	# align round key
	vsrb	v1, v0, v8		# vpsrlb \$4, %xmm0, %xmm0
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb %xmm1, %xmm2, %xmm1
	vperm	v1, $ipthi, $ipthi, v1	# vpshufb %xmm0, %xmm3, %xmm2
	vxor	v0, v0, v5		# vpxor %xmm5, %xmm1, %xmm0
	vxor	v0, v0, v1		# vpxor %xmm2, %xmm0, %xmm0

	# middle of middle round
	vperm	v4, $sb1t, v7, v2	# vpshufb %xmm2, %xmm13, %xmm4	# 4 = sb1u
	lvx	v1, r12, r11		# vmovdqa -0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	vperm	v0, $sb1u, v7, v3	# vpshufb %xmm3, %xmm12, %xmm0	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor %xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	andi.	r11, r11, 0x30		# and \$0x30, %r11	# ... mod 4
	vperm	v5, $sb2t, v7, v2	# vpshufb %xmm2, %xmm15, %xmm5	# 4 = sb2u
	vxor	v0, v0, v4		# vpxor %xmm4, %xmm0, %xmm0	# 0 = A
	vperm	v2, $sb2u, v7, v3	# vpshufb %xmm3, %xmm14, %xmm2	# 2 = sb2t
	lvx	v4, r12, r10		# vmovdqa (%r11,%r10), %xmm4	# .Lk_mc_backward[]
	vperm	v3, v0, v7, v1		# vpshufb %xmm1, %xmm0, %xmm3	# 0 = B
	vxor	v2, v2, v5		# vpxor %xmm5, %xmm2, %xmm2	# 2 = 2A
	vperm	v0, v0, v7, v4		# vpshufb %xmm4, %xmm0, %xmm0	# 3 = D
	vxor	v3, v3, v2		# vpxor %xmm2, %xmm3, %xmm3	# 0 = 2A+B
	vperm	v4, v3, v7, v1		# vpshufb %xmm1, %xmm3, %xmm4	# 0 = 2B+C
	vxor	v0, v0, v3		# vpxor %xmm3, %xmm0, %xmm0	# 3 = 2A+B+D
	vxor	v0, v0, v4		# vpxor %xmm4, %xmm0, %xmm0	# 0 = 2A+3B+C+D
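	# (The xor ladder above is MixColumns: with sb2 supplying the
	# doubled S-box outputs, 2A+3B+C+D per byte is exactly the
	# 2,3,1,1 circulant, and the mc_forward/mc_backward shuffles
	# perform the lane rotations that line B, C and D up.)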
	vsrb	v1, v0, v8		# vpsrlb \$4, %xmm0, %xmm0	# 1 = i
	vperm	v5, $invhi, $invhi, v0	# vpshufb %xmm1, %xmm11, %xmm5	# 2 = a/k
	vxor	v0, v0, v1		# vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	vperm	v3, $invlo, $invlo, v1	# vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	vperm	v4, $invlo, $invlo, v0	# vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	vxor	v3, v3, v5		# vpxor %xmm5, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	vxor	v4, v4, v5		# vpxor %xmm5, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v7, v3	# vpshufb %xmm3, %xmm10, %xmm2	# 2 = 1/iak
	lvx	v6, r9, $key		# vmovdqu (%r9), %xmm5
	vperm	v3, $invlo, v7, v4	# vpshufb %xmm4, %xmm10, %xmm3	# 3 = 1/jak
	vxor	v2, v2, v0		# vpxor %xmm1, %xmm2, %xmm2	# 2 = io
	vperm	v5, v5, v6, $keyperm	# align round key
	vxor	v3, v3, v1		# vpxor %xmm0, %xmm3, %xmm3	# 3 = jo
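	# (This is the "nested inversion" noted at the top of the file:
	# the byte is split into nibbles i and j, and the GF(2^8)
	# inverse is assembled from GF(2^4) lookups 1/i, 1/j, 1/iak and
	# 1/jak, so no secret-dependent memory access ever occurs.)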
	# middle of last round
	# vmovdqa -0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
	# vmovdqa -0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	vperm	v4, $sbou, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbou
	lvx	v1, r12, r10		# vmovdqa 0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	vperm	v0, $sbot, v7, v3	# vpshufb %xmm3, %xmm0, %xmm0	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor %xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	vxor	v0, v0, v4		# vpxor %xmm4, %xmm0, %xmm0	# 0 = A
	vperm	v0, v0, v7, v1		# vpshufb %xmm1, %xmm0, %xmm0
	.byte	0,12,0x14,0,0,0,0,0
.globl	.vpaes_encrypt
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mfspr	r7, 256			# save vrsave
	stw	r7,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r6,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			# preserve all AltiVec registers
	bl	_vpaes_encrypt_preheat

	neg	r8, $inp		# prepare for unaligned access
	lvsl	$keyperm, 0, $key
	lvsr	$outperm, 0, $out
	lvsr	$inpperm, 0, r8		# -$inp
	vnor	$outmask, v7, v7	# 0xff..ff
	lvx	$inptail, 0, $inp
	vperm	$outmask, v7, $outmask, $outperm
	addi	$inp, $inp, 15		# 15 is not a typo
	lvx	$outhead, 0, $out

	lvx	$inptail, 0, $inp	# redundant in aligned case
	vperm	v0, v0, $inptail, $inpperm

	bl	_vpaes_encrypt_core

	vperm	v0, v0, v0, $outperm	# rotate left
	vsel	v1, $outhead, v0, $outmask
	addi	$out, $out, 15		# 15 is not a typo
	lvx	v1, 0, $out		# redundant in aligned case
	vsel	v1, $outhead, v1, $outmask

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r7			# restore vrsave
	.byte	0,12,0x04,1,0x80,0,3,0
.size	.vpaes_encrypt,.-.vpaes_encrypt
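#
# C-level view of the symbol above (a sketch, assuming OpenSSL's usual
# vpaes prototypes):
#
#	void vpaes_encrypt(const unsigned char *in, unsigned char *out,
#			   const AES_KEY *key);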
_vpaes_decrypt_preheat:
	li	r11, 0xc0		# Lk_inv
	vxor	v7, v7, v7		# 0x00..00
	vspltisb	v8, 4		# 0x04..04
	vspltisb	v9, 0x0f	# 0x0f..0f
	.byte	0,12,0x14,0,0,0,0,0

## Same API as encryption core.
	lwz	r8, 240($key)		# pull rounds
	lvx	v5, 0, $key		# vmovdqu (%r9), %xmm4	# round0 key
	vperm	v5, v5, v6, $keyperm	# align round key
	vsrb	v1, v0, v8		# vpsrlb \$4, %xmm0, %xmm0
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb %xmm1, %xmm2, %xmm2
	vperm	v1, $ipthi, $ipthi, v1	# vpshufb %xmm0, %xmm1, %xmm0
	vxor	v0, v0, v5		# vpxor %xmm4, %xmm2, %xmm2
	vxor	v0, v0, v1		# vpxor %xmm2, %xmm0, %xmm0

	# Inverse mix columns
	lvx	v0, r12, r11		# v5 and v0 are flipped
	# vmovdqa -0x20(%r10),%xmm4	# 4 : sb9u
	# vmovdqa -0x10(%r10),%xmm1	# 0 : sb9t
	vperm	v4, $sb9u, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sb9u
	vperm	v1, $sb9t, v7, v3	# vpshufb %xmm3, %xmm1, %xmm1	# 0 = sb9t
	vxor	v5, v5, v4		# vpxor %xmm4, %xmm0, %xmm0
	# vmovdqa 0x00(%r10),%xmm4	# 4 : sbdu
	vxor	v5, v5, v1		# vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
	# vmovdqa 0x10(%r10),%xmm1	# 0 : sbdt
	vperm	v4, $sbdu, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbdu
	vperm	v5, v5, v7, v0		# vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	vperm	v1, $sbdt, v7, v3	# vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbdt
	vxor	v5, v5, v4		# vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
	# vmovdqa 0x20(%r10), %xmm4	# 4 : sbbu
	vxor	v5, v5, v1		# vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
	# vmovdqa 0x30(%r10), %xmm1	# 0 : sbbt
	vperm	v4, $sbbu, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbbu
	vperm	v5, v5, v7, v0		# vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	vperm	v1, $sbbt, v7, v3	# vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbbt
	vxor	v5, v5, v4		# vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
	# vmovdqa 0x40(%r10), %xmm4	# 4 : sbeu
	vxor	v5, v5, v1		# vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
	# vmovdqa 0x50(%r10), %xmm1	# 0 : sbet
	vperm	v4, $sbeu, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbeu
	vperm	v5, v5, v7, v0		# vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	vperm	v1, $sbet, v7, v3	# vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbet
	vxor	v0, v5, v4		# vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
	vxor	v0, v0, v1		# vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
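	# (Each group above folds one InvMixColumns coefficient into the
	# running value: ch = MC(ch) ^ sbXu[lo] ^ sbXt[hi] for X = 9, D,
	# B, E in turn; the "MC ch" vperm supplies the byte rotation
	# between steps.)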
	vsrb	v1, v0, v8		# vpsrlb \$4, %xmm0, %xmm0	# 1 = i
	vperm	v2, $invhi, $invhi, v0	# vpshufb %xmm1, %xmm11, %xmm2	# 2 = a/k
	vxor	v0, v0, v1		# vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	vperm	v3, $invlo, $invlo, v1	# vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	vperm	v4, $invlo, $invlo, v0	# vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	vxor	v3, v3, v2		# vpxor %xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	vxor	v4, v4, v2		# vpxor %xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v7, v3	# vpshufb %xmm3, %xmm10, %xmm2	# 2 = 1/iak
	lvx	v6, r9, $key		# vmovdqu (%r9), %xmm0
	vperm	v3, $invlo, v7, v4	# vpshufb %xmm4, %xmm10, %xmm3	# 3 = 1/jak
	vxor	v2, v2, v0		# vpxor %xmm1, %xmm2, %xmm2	# 2 = io
	vperm	v5, v5, v6, $keyperm	# align round key
	vxor	v3, v3, v1		# vpxor %xmm0, %xmm3, %xmm3	# 3 = jo

	# middle of last round
	# vmovdqa 0x60(%r10), %xmm4	# 3 : sbou
	vperm	v4, $sbou, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbou
	# vmovdqa 0x70(%r10), %xmm1	# 0 : sbot
	lvx	v2, r12, r10		# vmovdqa -0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	vperm	v1, $sbot, v7, v3	# vpshufb %xmm3, %xmm1, %xmm1	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor %xmm0, %xmm4, %xmm4	# 4 = sb1u + k
	vxor	v0, v1, v4		# vpxor %xmm4, %xmm1, %xmm0	# 0 = A
	vperm	v0, v0, v7, v2		# vpshufb %xmm2, %xmm0, %xmm0
	.byte	0,12,0x14,0,0,0,0,0
.globl	.vpaes_decrypt
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mfspr	r7, 256			# save vrsave
	stw	r7,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r6,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			# preserve all AltiVec registers
	bl	_vpaes_decrypt_preheat

	neg	r8, $inp		# prepare for unaligned access
	lvsl	$keyperm, 0, $key
	lvsr	$outperm, 0, $out
	lvsr	$inpperm, 0, r8		# -$inp
	vnor	$outmask, v7, v7	# 0xff..ff
	lvx	$inptail, 0, $inp
	vperm	$outmask, v7, $outmask, $outperm
	addi	$inp, $inp, 15		# 15 is not a typo
	lvx	$outhead, 0, $out

	lvx	$inptail, 0, $inp	# redundant in aligned case
	vperm	v0, v0, $inptail, $inpperm

	bl	_vpaes_decrypt_core

	vperm	v0, v0, v0, $outperm	# rotate left
	vsel	v1, $outhead, v0, $outmask
	addi	$out, $out, 15		# 15 is not a typo
	lvx	v1, 0, $out		# redundant in aligned case
	vsel	v1, $outhead, v1, $outmask

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r7			# restore vrsave
	.byte	0,12,0x04,1,0x80,0,3,0
.size	.vpaes_decrypt,.-.vpaes_decrypt
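#
# C-level counterpart, under the same sketch assumptions:
#
#	void vpaes_decrypt(const unsigned char *in, unsigned char *out,
#			   const AES_KEY *key);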
.globl	.vpaes_cbc_encrypt
	$STU	$sp,-`($FRAME+2*$SIZE_T)`($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	stw	r12,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r30,`$FRAME+$SIZE_T*0`($sp)
	$PUSH	r31,`$FRAME+$SIZE_T*1`($sp)
	$PUSH	r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
	sub.	r30, r5, r9		# copy length-16
	mr	r5, r6			# copy pointer to key
	mr	r31, r7			# copy pointer to iv
	cmpwi	r8, 0			# test direction
	mr	r7, r12			# copy vrsave
	mtspr	256, r6			# preserve all AltiVec registers

	lvx	v24, 0, r31		# load [potentially unaligned] iv
	lvsl	$inpperm, 0, r31
	vperm	v24, v24, v25, $inpperm

	neg	r8, $inp		# prepare for unaligned access
	lvsl	$keyperm, 0, $key
	lvsr	$outperm, 0, $out
	lvsr	$inpperm, 0, r8		# -$inp
	vnor	$outmask, v7, v7	# 0xff..ff
	lvx	$inptail, 0, $inp
	vperm	$outmask, v7, $outmask, $outperm
	addi	$inp, $inp, 15		# 15 is not a typo
	lvx	$outhead, 0, $out

	bl	_vpaes_encrypt_preheat

	lvx	$inptail, 0, $inp
	vperm	v0, v0, $inptail, $inpperm
	vxor	v0, v0, v24		# ^= iv

	bl	_vpaes_encrypt_core

	vmr	v24, v0			# put aside iv
	sub.	r30, r30, r0		# len -= 16
	vperm	v0, v0, v0, $outperm	# rotate left
	vsel	v1, $outhead, v0, $outmask

	bl	_vpaes_decrypt_preheat

	lvx	$inptail, 0, $inp
	vperm	v0, v0, $inptail, $inpperm
	vmr	v25, v0			# put aside input

	bl	_vpaes_decrypt_core

	vxor	v0, v0, v24		# ^= iv
	sub.	r30, r30, r0		# len -= 16
	vperm	v0, v0, v0, $outperm	# rotate left
	vsel	v1, $outhead, v0, $outmask

	lvx	v1, 0, $out		# redundant in aligned case
	vsel	v1, $outhead, v1, $outmask

	neg	r8, r31			# write [potentially unaligned] iv
	vnor	$outmask, v7, v7	# 0xff..ff
	vperm	$outmask, v7, $outmask, $outperm
	vperm	v24, v24, v24, $outperm	# rotate
	vsel	v0, $outhead, v24, $outmask
	vsel	v1, v24, v1, $outmask

	mtspr	256, r7			# restore vrsave
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	$POP	r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
	$POP	r30,`$FRAME+$SIZE_T*0`($sp)
	$POP	r31,`$FRAME+$SIZE_T*1`($sp)
	addi	$sp,$sp,`$FRAME+$SIZE_T*2`
	.byte	0,12,0x04,1,0x80,2,6,0
.size	.vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
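#
# C-level view (a sketch, assuming the usual OpenSSL CBC convention:
# length in bytes, enc nonzero for encryption):
#
#	void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
#			       size_t length, const AES_KEY *key,
#			       unsigned char *ivec, int enc);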
my ($inp,$bits,$out)=map("r$_",(3..5));
my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
_vpaes_key_preheat:
	li	r11, 0xc0		# Lk_inv
	vspltisb	v8, 4		# 0x04..04
	vxor	v9, v9, v9		# 0x00..00
	lvx	$invlo, r12, r11	# Lk_inv
	lvx	$iptlo, r12, r9		# Lk_ipt
	lvx	v14, r12, r11		# Lk_sb1
	lvx	v16, r12, r9		# Lk_dksd
	lvx	v18, r12, r11		# Lk_dksb
	lvx	v20, r12, r9		# Lk_dkse
	lvx	v22, r12, r11		# Lk_dks9
	lvx	v24, r12, r9		# Lk_rcon
	lvx	v25, 0, r12		# Lk_mc_forward[0]
	lvx	v26, r12, r8		# Lk_s63
	.byte	0,12,0x14,0,0,0,0,0
_vpaes_schedule_core:
	bl	_vpaes_key_preheat	# load the tables

	#lvx	v0, 0, $inp		# vmovdqu (%rdi), %xmm0	# load key (unaligned)
	neg	r8, $inp		# prepare for unaligned access
	addi	$inp, $inp, 15		# 15 is not typo
	lvsr	$inpperm, 0, r8		# -$inp
	lvx	v6, 0, $inp		# v6 serves as inptail
	vperm	v0, v0, v6, $inpperm

	vmr	v3, v0			# vmovdqa %xmm0, %xmm3
	bl	_vpaes_schedule_transform
	vmr	v7, v0			# vmovdqa %xmm0, %xmm7

	bne	$dir, Lschedule_am_decrypting
	# encrypting, output zeroth round key after transform
	li	r8, 0x30		# mov \$0x30,%r8d
	addi	r10, r12, 0x80		# lea .Lk_sr(%rip),%r10
	lvsr	$outperm, 0, $out	# prepare for unaligned access
	vspltisb	$outmask, -1	# 0xff..ff
	lvx	$outhead, 0, $out
	vperm	$outmask, v9, $outmask, $outperm
	#stvx	v0, 0, $out		# vmovdqu %xmm0, (%rdx)
	vperm	v1, v0, v0, $outperm	# rotate left
	vsel	v2, $outhead, v1, $outmask
Lschedule_am_decrypting:
	srwi	r8, $bits, 1		# shr \$1,%r8d
	andi.	r8, r8, 32		# and \$32,%r8d
	xori	r8, r8, 32		# xor \$32,%r8d	# nbits==192?0:32
	addi	r10, r12, 0x80		# lea .Lk_sr(%rip),%r10
	# decrypting, output zeroth round key after shiftrows
	lvx	v1, r8, r10		# vmovdqa (%r8,%r10), %xmm1
	vperm	v4, v3, v3, v1		# vpshufb %xmm1, %xmm3, %xmm3
	neg	r0, $out		# prepare for unaligned access
	addi	$out, $out, 15		# 15 is not typo
	vspltisb	$outmask, -1	# 0xff..ff
	lvx	$outhead, 0, $out
	vperm	$outmask, $outmask, v9, $outperm
	#stvx	v4, 0, $out		# vmovdqu %xmm3, (%rdx)
	vperm	v4, v4, v4, $outperm	# rotate left
	vsel	v2, $outhead, v4, $outmask
	xori	r8, r8, 0x30		# xor \$0x30, %r8
	cmplwi	$bits, 192		# cmp \$192, %esi
## 128-bit specific part of key schedule.
##
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
	li	r0, 10			# mov \$10, %esi

	bl	_vpaes_schedule_round
	bdz	Lschedule_mangle_last	# dec %esi
	bl	_vpaes_schedule_mangle	# write output
## 192-bit specific part of key schedule.
##
## The main body of this schedule is the same as the 128-bit
## schedule, but with more smearing. The long, high side is
## stored in %xmm7 as before, and the short, low side is in
## the high bits of %xmm6.
##
## This schedule is somewhat nastier, however, because each
## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round
## keys.
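##
## (Worked count, for orientation: AES-192 needs 13 round keys. The
## loop below runs 4 cycles of 2 rounds, emitting 3 round keys per
## cycle, the last of them via Lschedule_mangle_last; with the initial
## key that is 4*3 + 1 = 13.)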
	li	r0, 4			# mov \$4, %esi

	vperm	v0, v6, v0, $inpperm
	vsldoi	v0, v3, v0, 8		# vmovdqu 8(%rdi),%xmm0	# load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	# input transform
	vsldoi	v6, v9, v6, 8		# clobber "low" side with zeros

	bl	_vpaes_schedule_round
	vsldoi	v0, v6, v0, 8		# vpalignr \$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle	# save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle	# save key n+1
	bl	_vpaes_schedule_round
	bdz	Lschedule_mangle_last	# dec %esi
	bl	_vpaes_schedule_mangle	# save key n+2
	bl	_vpaes_schedule_192_smear
## 256-bit specific part of key schedule.
##
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
	li	r0, 7			# mov \$7, %esi

	lvx	v0, 0, $inp		# vmovdqu 16(%rdi),%xmm0	# load key part 2 (unaligned)
	vperm	v0, v6, v0, $inpperm
	bl	_vpaes_schedule_transform	# input transform

	bl	_vpaes_schedule_mangle	# output low result
	vmr	v6, v0			# vmovdqa %xmm0, %xmm6	# save cur_lo in xmm6

	bl	_vpaes_schedule_round
	bdz	Lschedule_mangle_last	# dec %esi
	bl	_vpaes_schedule_mangle

	# low round. swap xmm7 and xmm6
	vspltw	v0, v0, 3		# vpshufd \$0xFF, %xmm0, %xmm0
	vmr	v5, v7			# vmovdqa %xmm7, %xmm5
	vmr	v7, v6			# vmovdqa %xmm6, %xmm7
	bl	_vpaes_schedule_low_round
	vmr	v7, v5			# vmovdqa %xmm5, %xmm7
## .aes_schedule_mangle_last
##
## Mangler for last round of key schedule.
## When encrypting, outputs out(%xmm0) ^ 63.
## When decrypting, outputs unskew(%xmm0).
##
## Always called right before return... jumps to cleanup and exits.
Lschedule_mangle_last:
	# schedule last round key from xmm0
	li	r11, 0x2e0		# lea .Lk_deskew(%rip),%r11
	bne	$dir, Lschedule_mangle_last_dec

	lvx	v1, r8, r10		# vmovdqa (%r8,%r10),%xmm1
	li	r11, 0x2c0		# lea .Lk_opt(%rip), %r11	# prepare to output transform
	li	r9, 0x2d0		# prepare to output transform
	vperm	v0, v0, v0, v1		# vpshufb %xmm1, %xmm0, %xmm0	# output permute
	lvx	$iptlo, r11, r12	# reload $ipt
	addi	$out, $out, 16		# add \$16, %rdx
	vxor	v0, v0, v26		# vpxor .Lk_s63(%rip), %xmm0, %xmm0
	bl	_vpaes_schedule_transform	# output transform
	#stvx	v0, r0, $out		# vmovdqu %xmm0, (%rdx)	# save last key
	vperm	v0, v0, v0, $outperm	# rotate left
	vsel	v2, $outhead, v0, $outmask
	addi	$out, $out, 15		# 15 is not typo
	lvx	v1, 0, $out		# redundant in aligned case
	vsel	v1, $outhead, v1, $outmask
	b	Lschedule_mangle_done
Lschedule_mangle_last_dec:
	lvx	$iptlo, r11, r12	# reload $ipt
	addi	$out, $out, -16		# add \$-16, %rdx
	vxor	v0, v0, v26		# vpxor .Lk_s63(%rip), %xmm0, %xmm0
	bl	_vpaes_schedule_transform	# output transform
	#stvx	v0, r0, $out		# vmovdqu %xmm0, (%rdx)	# save last key
	vperm	v0, v0, v0, $outperm	# rotate left
	vsel	v2, $outhead, v0, $outmask
	addi	$out, $out, -15		# -15 is not typo
	lvx	v1, 0, $out		# redundant in aligned case
	vsel	v1, $outhead, v1, $outmask

Lschedule_mangle_done:
	vxor	v0, v0, v0		# vpxor %xmm0, %xmm0, %xmm0
	vxor	v1, v1, v1		# vpxor %xmm1, %xmm1, %xmm1
	vxor	v2, v2, v2		# vpxor %xmm2, %xmm2, %xmm2
	vxor	v3, v3, v3		# vpxor %xmm3, %xmm3, %xmm3
	vxor	v4, v4, v4		# vpxor %xmm4, %xmm4, %xmm4
	vxor	v5, v5, v5		# vpxor %xmm5, %xmm5, %xmm5
	vxor	v6, v6, v6		# vpxor %xmm6, %xmm6, %xmm6
	vxor	v7, v7, v7		# vpxor %xmm7, %xmm7, %xmm7
	.byte	0,12,0x14,0,0,0,0,0
## .aes_schedule_192_smear
##
## Smear the short, low side in the 192-bit key schedule.
##
## Inputs:
##	%xmm7: high side, b a x y
##	%xmm6: low side, d c 0 0
## Outputs:
##	%xmm6: b+c+d b+c 0 0
##	%xmm0: b+c+d b+c b a
_vpaes_schedule_192_smear:
	vsldoi	v1, v9, v6, 12		# vpshufd \$0x80, %xmm6, %xmm1	# d c 0 0 -> c 0 0 0
	vsldoi	v0, v7, v0, 8		# vpshufd \$0xFE, %xmm7, %xmm0	# b a _ _ -> b b b a
	vxor	v6, v6, v1		# vpxor %xmm1, %xmm6, %xmm6	# -> c+d c 0 0
	vxor	v6, v6, v0		# vpxor %xmm0, %xmm6, %xmm6	# -> b+c+d b+c b a

	vsldoi	v6, v6, v9, 8
	vsldoi	v6, v9, v6, 8		# clobber low side with zeros
	.byte	0,12,0x14,0,0,0,0,0
## .aes_schedule_round
##
## Runs one main round of the key schedule on %xmm0, %xmm7.
##
## Specifically, runs subbytes on the high dword of %xmm0,
## then rotates it by one byte and xors into the low dword of
## %xmm7.
##
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## the next rcon.
##
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
##
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm4, %r11.
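##
## (For comparison, this is FIPS-197 key expansion: for AES-128 round
## 1, temp = SubWord(RotWord(w[3])) ^ Rcon[1], and the "smear" is the
## running xor chain w[4] = w[0] ^ temp, w[5] = w[1] ^ w[4], and so
## on. A sketch for orientation only.)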
_vpaes_schedule_round:
	# extract rcon from xmm8
	#vxor	v4, v4, v4		# vpxor %xmm4, %xmm4, %xmm4
	vsldoi	v1, $rcon, v9, 15	# vpalignr \$15, %xmm8, %xmm4, %xmm1
	vsldoi	$rcon, $rcon, $rcon, 15	# vpalignr \$15, %xmm8, %xmm8, %xmm8
	vxor	v7, v7, v1		# vpxor %xmm1, %xmm7, %xmm7

	vspltw	v0, v0, 3		# vpshufd \$0xFF, %xmm0, %xmm0
	vsldoi	v0, v0, v0, 1		# vpalignr \$1, %xmm0, %xmm0, %xmm0

	# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	vsldoi	v1, v9, v7, 12		# vpslldq \$4, %xmm7, %xmm1
	vxor	v7, v7, v1		# vpxor %xmm1, %xmm7, %xmm7
	vspltisb	v1, 0x0f	# 0x0f..0f
	vsldoi	v4, v9, v7, 8		# vpslldq \$8, %xmm7, %xmm4

	vand	v1, v1, v0		# vpand %xmm9, %xmm0, %xmm1	# 0 = k
	vsrb	v0, v0, v8		# vpsrlb \$4, %xmm0, %xmm0	# 1 = i
	vxor	v7, v7, v4		# vpxor %xmm4, %xmm7, %xmm7
	vperm	v2, $invhi, v9, v1	# vpshufb %xmm1, %xmm11, %xmm2	# 2 = a/k
	vxor	v1, v1, v0		# vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	vperm	v3, $invlo, v9, v0	# vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	vxor	v3, v3, v2		# vpxor %xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	vperm	v4, $invlo, v9, v1	# vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	vxor	v7, v7, v26		# vpxor .Lk_s63(%rip), %xmm7, %xmm7
	vperm	v3, $invlo, v9, v3	# vpshufb %xmm3, %xmm10, %xmm3	# 2 = 1/iak
	vxor	v4, v4, v2		# vpxor %xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v9, v4	# vpshufb %xmm4, %xmm10, %xmm2	# 3 = 1/jak
	vxor	v3, v3, v1		# vpxor %xmm1, %xmm3, %xmm3	# 2 = io
	vxor	v2, v2, v0		# vpxor %xmm0, %xmm2, %xmm2	# 3 = jo
	vperm	v4, v15, v9, v3		# vpshufb %xmm3, %xmm13, %xmm4	# 4 = sbou
	vperm	v1, v14, v9, v2		# vpshufb %xmm2, %xmm12, %xmm1	# 0 = sb1t
	vxor	v1, v1, v4		# vpxor %xmm4, %xmm1, %xmm1	# 0 = sbox output

	# add in smeared stuff
	vxor	v0, v1, v7		# vpxor %xmm7, %xmm1, %xmm0
	vxor	v7, v1, v7		# vmovdqa %xmm0, %xmm7
	.byte	0,12,0x14,0,0,0,0,0
## .aes_schedule_transform
##
## Linear-transform %xmm0 according to tables at (%r11).
## Requires that %xmm9 = 0x0F0F... as in preheat.
_vpaes_schedule_transform:
	#vand	v1, v0, v9		# vpand %xmm9, %xmm0, %xmm1
	vsrb	v2, v0, v8		# vpsrlb \$4, %xmm0, %xmm0
	# vmovdqa (%r11), %xmm2	# lo
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb %xmm1, %xmm2, %xmm2
	# vmovdqa 16(%r11), %xmm1	# hi
	vperm	v2, $ipthi, $ipthi, v2	# vpshufb %xmm0, %xmm1, %xmm0
	vxor	v0, v0, v2		# vpxor %xmm2, %xmm0, %xmm0
	.byte	0,12,0x14,0,0,0,0,0
## .aes_schedule_mangle
##
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt:
##	multiply by circulant 0,1,1,1
##	apply shiftrows transform
##
## On decrypt:
##	multiply by "inverse mixcolumns" circulant E,B,D,9
##	apply shiftrows transform
##
## Writes out to (%rdx), and increments or decrements it.
## Keeps track of round number mod 4 in %r8.
## Clobbers xmm1-xmm5.
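##
## (The 0,1,1,1 circulant on the encrypt path is realized by the three
## successive mc_forward shuffles below: they produce the three byte
## rotations of each column, and xoring them together gives every
## output byte the sum of the other three bytes in its column.)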
_vpaes_schedule_mangle:
	#vmr	v4, v0			# vmovdqa %xmm0, %xmm4	# save xmm0 for later
	# vmovdqa .Lk_mc_forward(%rip),%xmm5
	bne	$dir, Lschedule_mangle_dec

	vxor	v4, v0, v26		# vpxor .Lk_s63(%rip), %xmm0, %xmm4
	addi	$out, $out, 16		# add \$16, %rdx
	vperm	v4, v4, v4, v25		# vpshufb %xmm5, %xmm4, %xmm4
	vperm	v1, v4, v4, v25		# vpshufb %xmm5, %xmm4, %xmm1
	vperm	v3, v1, v1, v25		# vpshufb %xmm5, %xmm1, %xmm3
	vxor	v4, v4, v1		# vpxor %xmm1, %xmm4, %xmm4
	lvx	v1, r8, r10		# vmovdqa (%r8,%r10), %xmm1
	vxor	v3, v3, v4		# vpxor %xmm4, %xmm3, %xmm3

	vperm	v3, v3, v3, v1		# vpshufb %xmm1, %xmm3, %xmm3
	addi	r8, r8, -16		# add \$-16, %r8
	andi.	r8, r8, 0x30		# and \$0x30, %r8

	#stvx	v3, 0, $out		# vmovdqu %xmm3, (%rdx)
	vperm	v1, v3, v3, $outperm	# rotate left
	vsel	v2, $outhead, v1, $outmask
Lschedule_mangle_dec:
	# inverse mix columns
	# lea .Lk_dksd(%rip),%r11
	vsrb	v1, v0, v8		# vpsrlb \$4, %xmm4, %xmm1	# 1 = hi
	#and	v4, v0, v9		# vpand %xmm9, %xmm4, %xmm4	# 4 = lo

	# vmovdqa 0x00(%r11), %xmm2
	vperm	v2, v16, v16, v0	# vpshufb %xmm4, %xmm2, %xmm2
	# vmovdqa 0x10(%r11), %xmm3
	vperm	v3, v17, v17, v1	# vpshufb %xmm1, %xmm3, %xmm3
	vxor	v3, v3, v2		# vpxor %xmm2, %xmm3, %xmm3
	vperm	v3, v3, v9, v25		# vpshufb %xmm5, %xmm3, %xmm3

	# vmovdqa 0x20(%r11), %xmm2
	vperm	v2, v18, v18, v0	# vpshufb %xmm4, %xmm2, %xmm2
	vxor	v2, v2, v3		# vpxor %xmm3, %xmm2, %xmm2
	# vmovdqa 0x30(%r11), %xmm3
	vperm	v3, v19, v19, v1	# vpshufb %xmm1, %xmm3, %xmm3
	vxor	v3, v3, v2		# vpxor %xmm2, %xmm3, %xmm3
	vperm	v3, v3, v9, v25		# vpshufb %xmm5, %xmm3, %xmm3

	# vmovdqa 0x40(%r11), %xmm2
	vperm	v2, v20, v20, v0	# vpshufb %xmm4, %xmm2, %xmm2
	vxor	v2, v2, v3		# vpxor %xmm3, %xmm2, %xmm2
	# vmovdqa 0x50(%r11), %xmm3
	vperm	v3, v21, v21, v1	# vpshufb %xmm1, %xmm3, %xmm3
	vxor	v3, v3, v2		# vpxor %xmm2, %xmm3, %xmm3

	# vmovdqa 0x60(%r11), %xmm2
	vperm	v2, v22, v22, v0	# vpshufb %xmm4, %xmm2, %xmm2
	vperm	v3, v3, v9, v25		# vpshufb %xmm5, %xmm3, %xmm3
	# vmovdqa 0x70(%r11), %xmm4
	vperm	v4, v23, v23, v1	# vpshufb %xmm1, %xmm4, %xmm4
	lvx	v1, r8, r10		# vmovdqa (%r8,%r10), %xmm1
	vxor	v2, v2, v3		# vpxor %xmm3, %xmm2, %xmm2
	vxor	v3, v4, v2		# vpxor %xmm2, %xmm4, %xmm3

	addi	$out, $out, -16		# add \$-16, %rdx

	vperm	v3, v3, v3, v1		# vpshufb %xmm1, %xmm3, %xmm3
	addi	r8, r8, -16		# add \$-16, %r8
	andi.	r8, r8, 0x30		# and \$0x30, %r8

	#stvx	v3, 0, $out		# vmovdqu %xmm3, (%rdx)
	vperm	v1, v3, v3, $outperm	# rotate left
	vsel	v2, $outhead, v1, $outmask
	.byte	0,12,0x14,0,0,0,0,0
.globl	.vpaes_set_encrypt_key
.vpaes_set_encrypt_key:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mfspr	r6, 256			# save vrsave
	stw	r6,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
	mtspr	256, r7			# preserve all AltiVec registers
	srwi	r9, $bits, 5		# shr \$5,%eax
	addi	r9, r9, 6		# add \$6,%eax (x86 adds \$5; this code keeps the full round count)
	stw	r9, 240($out)		# mov %eax,240(%rdx)	# AES_KEY->rounds = nbits/32+6;

	cmplw	$dir, $bits, $bits	# set direction: EQ means encrypting
	li	r8, 0x30		# mov \$0x30,%r8d
	bl	_vpaes_schedule_core

	$POP	r0, `$FRAME+$LRSAVE`($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r6			# restore vrsave
	.byte	0,12,0x04,1,0x80,0,3,0
.size	.vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key
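#
# C-level view (a sketch, assuming OpenSSL's vpaes conventions):
#
#	int vpaes_set_encrypt_key(const unsigned char *userKey,
#				  int bits, AES_KEY *key);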
.globl	.vpaes_set_decrypt_key
.vpaes_set_decrypt_key:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mfspr	r6, 256			# save vrsave
	stw	r6,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
	mtspr	256, r7			# preserve all AltiVec registers
	srwi	r9, $bits, 5		# shr \$5,%eax
	addi	r9, r9, 6		# add \$6,%eax (x86 adds \$5; this code keeps the full round count)
	stw	r9, 240($out)		# mov %eax,240(%rdx)	# AES_KEY->rounds = nbits/32+6;

	slwi	r9, r9, 4		# shl \$4,%eax
	add	$out, $out, r9		# lea (%rdx,%rax),%rdx

	cmplwi	$dir, $bits, 0		# set direction: NE means decrypting
	srwi	r8, $bits, 1		# shr \$1,%r8d
	andi.	r8, r8, 32		# and \$32,%r8d
	xori	r8, r8, 32		# xor \$32,%r8d	# nbits==192?0:32
	bl	_vpaes_schedule_core

	$POP	r0, `$FRAME+$LRSAVE`($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r6			# restore vrsave
	.byte	0,12,0x04,1,0x80,0,3,0
.size	.vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
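#
# Decryption-key counterpart, under the same sketch assumptions:
#
#	int vpaes_set_decrypt_key(const unsigned char *userKey,
#				  int bits, AES_KEY *key);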
$code =~ s/\`([^\`]*)\`/eval($1)/gem;