3 ######################################################################
4 ## Constant-time SSSE3 AES core implementation.
7 ## By Mike Hamburg (Stanford University), 2009
10 ## For details see http://shiftleft.org/papers/vector_aes/ and
11 ## http://crypto.stanford.edu/vpaes/.
# CBC encrypt/decrypt performance in cycles per byte processed with
# 128-bit key.
#
#		aes-ppc.pl		this
# G4e		35.5/52.1/(23.8)	11.9(*)/15.4
# POWER6	42.7/54.3/(28.2)	63.0/92.8(**)
# POWER7	32.3/42.9/(18.4)	18.5/23.3
# (*)	This is ~10% worse than reported in the paper. The reason is
#	twofold. This module doesn't make any assumption about
#	key schedule (or data for that matter) alignment and handles
#	it in-line. Secondly, being transliterated from
#	vpaes-x86_64.pl, it relies on "nested inversion" better
#	suited for Intel CPUs.
27 # (**) Inadequate POWER6 performance is due to astronomic AltiVec
28 # latency, 9 cycles per simple logical operation.
32 if ($flavour =~ /64/) {
38 } elsif ($flavour =~ /32/) {
44 } else { die "nonsense $flavour"; }
47 $FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
49 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
51 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
52 die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
61 .align 7 # totally strategic alignment
63 Lk_mc_forward: # mc_forward
64 .long 0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c ?inv
65 .long 0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300 ?inv
66 .long 0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704 ?inv
67 .long 0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08 ?inv
68 Lk_mc_backward: # mc_backward
69 .long 0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e ?inv
70 .long 0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a ?inv
71 .long 0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506 ?inv
72 .long 0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102 ?inv
74 .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f ?inv
75 .long 0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b ?inv
76 .long 0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07 ?inv
77 .long 0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603 ?inv
83 .long 0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704 ?rev
84 .long 0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03 ?rev
85 Lk_ipt: # input transform (lo, hi)
86 .long 0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca ?rev
87 .long 0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd ?rev
89 .long 0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15 ?rev
90 .long 0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e ?rev
92 .long 0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b ?rev
93 .long 0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5 ?rev
95 .long 0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2 ?rev
96 .long 0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e ?rev
101 Lk_dipt: # decryption input transform
102 .long 0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15 ?rev
103 .long 0x00650560, 0xe683e386, 0x94f191f4, 0x72177712 ?rev
104 Lk_dsbo: # decryption sbox final output
105 .long 0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7 ?rev
106 .long 0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca ?rev
107 Lk_dsb9: # decryption sbox output *9*u, *9*t
108 .long 0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca ?rev
109 .long 0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72 ?rev
110 Lk_dsbd: # decryption sbox output *D*u, *D*t
111 .long 0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5 ?rev
112 .long 0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129 ?rev
113 Lk_dsbb: # decryption sbox output *B*u, *B*t
114 .long 0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660 ?rev
115 .long 0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3 ?rev
116 Lk_dsbe: # decryption sbox output *E*u, *E*t
117 .long 0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222 ?rev
118 .long 0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794 ?rev
121 ## Key schedule constants
123 Lk_dksd: # decryption key schedule: invskew x*D
124 .long 0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007 ?rev
125 .long 0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f ?rev
126 Lk_dksb: # decryption key schedule: invskew x*B
127 .long 0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603 ?rev
128 .long 0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9 ?rev
129 Lk_dkse: # decryption key schedule: invskew x*E + 0x63
130 .long 0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553 ?rev
131 .long 0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd ?rev
132 Lk_dks9: # decryption key schedule: invskew x*9
133 .long 0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a ?rev
134 .long 0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b ?rev
137 .long 0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70 ?asis
139 .long 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b ?asis
141 Lk_opt: # output transform
142 .long 0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7 ?rev
143 .long 0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1 ?rev
144 Lk_deskew: # deskew tables: inverts the sbox's "skew"
145 .long 0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d ?rev
146 .long 0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128 ?rev
151 mflr r12 #vvvvv "distance between . and _vpaes_consts
156 .byte 0,12,0x14,0,0,0,0,0
157 .asciz "Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"
161 my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));
163 my ($inp,$out,$key) = map("r$_",(3..5));
165 my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
166 my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
167 my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));
173 ## Fills register %r10 -> .aes_consts (so you can -fPIC)
174 ## and %xmm9-%xmm15 as specified below.
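##
##  (In this AltiVec port, roughly: v8/v9 hold the 0x04../0x0f.. nibble masks,
##  v10/v11 the Lk_inv tables, v12/v13 the input transform, and v14 and up the
##  sbox output tables, per the register assignments above.)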
177 _vpaes_encrypt_preheat:
181 li r11, 0xc0 # Lk_inv
185 vxor v7, v7, v7 # 0x00..00
186 vspltisb v8,4 # 0x04..04
187 vspltisb v9,0x0f # 0x0f..0f
206 .byte 0,12,0x14,0,0,0,0,0
211 ## AES-encrypt %xmm0.
215 ## %xmm9-%xmm15 as in _vpaes_preheat
216 ## (%rdx) = scheduled keys
219 ## Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax
224 lwz r8, 240($key) # pull rounds
226 lvx v5, 0, $key # vmovdqu (%r9), %xmm5 # round0 key
230 ?vperm v5, v5, v6, $keyperm # align round key
232 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
233 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm1
234 vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm3, %xmm2
235 vxor v0, v0, v5 # vpxor %xmm5, %xmm1, %xmm0
236 vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
242 # middle of middle round
243 vperm v4, $sb1t, v7, v2 # vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
244 lvx v1, r12, r11 # vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
246 vperm v0, $sb1u, v7, v3 # vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
247 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
248 andi. r11, r11, 0x30 # and \$0x30, %r11 # ... mod 4
249 vperm v5, $sb2t, v7, v2 # vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
250 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
251 vperm v2, $sb2u, v7, v3 # vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
252 lvx v4, r12, r10 # vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
254 vperm v3, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
255 vxor v2, v2, v5 # vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
256 vperm v0, v0, v7, v4 # vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
257 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
258 vperm v4, v3, v7, v1 # vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
259 vxor v0, v0, v3 # vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
260 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
264 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
265 vperm v5, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
266 vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
267 vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
268 vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
270 vxor v3, v3, v5 # vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
271 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
272 vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
274 lvx v6, r9, $key # vmovdqu (%r9), %xmm5
275 vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
277 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
278 ?vperm v5, v5, v6, $keyperm # align round key
279 vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
282 # middle of last round
284 # vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
285 # vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
286 vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
287 lvx v1, r12, r10 # vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
288 vperm v0, $sbot, v7, v3 # vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
289 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
290 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
291 vperm v0, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm0
294 .byte 0,12,0x14,0,0,0,0,0
296 .globl .vpaes_encrypt
299 $STU $sp,-$FRAME($sp)
300 li r10,`15+6*$SIZE_T`
301 li r11,`31+6*$SIZE_T`
303 mfspr r7, 256 # save vrsave
stw r7,`$FRAME-4`($sp) # save vrsave
328 $PUSH r6,`$FRAME+$LRSAVE`($sp)
329 mtspr 256, r0 # preserve all AltiVec registers
331 bl _vpaes_encrypt_preheat
333 ?lvsl $inpperm, 0, $inp # prepare for unaligned access
335 addi $inp, $inp, 15 # 15 is not a typo
336 ?lvsr $outperm, 0, $out
337 ?lvsl $keyperm, 0, $key # prepare for unaligned access
338 vnor $outmask, v7, v7 # 0xff..ff
339 lvx $inptail, 0, $inp # redundant in aligned case
340 ?vperm $outmask, v7, $outmask, $outperm
341 lvx $outhead, 0, $out
342 ?vperm v0, v0, $inptail, $inpperm
344 bl _vpaes_encrypt_core
346 vperm v0, v0, v0, $outperm # rotate right/left
347 vsel v1, $outhead, v0, $outmask
350 addi $out, $out, 15 # 15 is not a typo
353 lvx v1, 0, $out # redundant in aligned case
354 vsel v1, $outhead, v1, $outmask
357 li r10,`15+6*$SIZE_T`
358 li r11,`31+6*$SIZE_T`
360 mtspr 256, r7 # restore vrsave
386 .byte 0,12,0x04,1,0x80,0,3,0
388 .size .vpaes_encrypt,.-.vpaes_encrypt
391 _vpaes_decrypt_preheat:
395 li r11, 0xc0 # Lk_inv
399 vxor v7, v7, v7 # 0x00..00
400 vspltisb v8,4 # 0x04..04
401 vspltisb v9,0x0f # 0x0f..0f
428 .byte 0,12,0x14,0,0,0,0,0
433 ## Same API as encryption core.
437 lwz r8, 240($key) # pull rounds
439 lvx v5, 0, $key # vmovdqu (%r9), %xmm4 # round0 key
443 ?vperm v5, v5, v6, $keyperm # align round key
444 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
445 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
446 vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm1, %xmm0
447 vxor v0, v0, v5 # vpxor %xmm4, %xmm2, %xmm2
448 vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
455 # Inverse mix columns
457 lvx v0, r12, r11 # v5 and v0 are flipped
458 # vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
459 # vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
460 vperm v4, $sb9u, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
462 vperm v1, $sb9t, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
464 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0
465 # vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
466 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
467 # vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
469 vperm v4, $sbdu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
470 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
471 vperm v1, $sbdt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
472 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
473 # vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
474 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
475 # vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
477 vperm v4, $sbbu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
478 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
479 vperm v1, $sbbt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
480 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
481 # vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
482 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
483 # vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
485 vperm v4, $sbeu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
486 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
487 vperm v1, $sbet, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
488 vxor v0, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
489 vxor v0, v0, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
493 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
494 vperm v2, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
495 vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
496 vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
497 vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
499 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
500 vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
501 vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
503 lvx v6, r9, $key # vmovdqu (%r9), %xmm0
504 vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
506 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
507 ?vperm v5, v5, v6, $keyperm # align round key
508 vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
511 # middle of last round
513 # vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
514 vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
515 # vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
516 lvx v2, r12, r10 # vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
517 vperm v1, $sbot, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
518 vxor v4, v4, v5 # vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
519 vxor v0, v1, v4 # vpxor %xmm4, %xmm1, %xmm0 # 0 = A
520 vperm v0, v0, v7, v2 # vpshufb %xmm2, %xmm0, %xmm0
523 .byte 0,12,0x14,0,0,0,0,0
525 .globl .vpaes_decrypt
528 $STU $sp,-$FRAME($sp)
529 li r10,`15+6*$SIZE_T`
530 li r11,`31+6*$SIZE_T`
532 mfspr r7, 256 # save vrsave
stw r7,`$FRAME-4`($sp) # save vrsave
557 $PUSH r6,`$FRAME+$LRSAVE`($sp)
558 mtspr 256, r0 # preserve all AltiVec registers
560 bl _vpaes_decrypt_preheat
562 ?lvsl $inpperm, 0, $inp # prepare for unaligned access
564 addi $inp, $inp, 15 # 15 is not a typo
565 ?lvsr $outperm, 0, $out
566 ?lvsl $keyperm, 0, $key
567 vnor $outmask, v7, v7 # 0xff..ff
568 lvx $inptail, 0, $inp # redundant in aligned case
569 ?vperm $outmask, v7, $outmask, $outperm
570 lvx $outhead, 0, $out
571 ?vperm v0, v0, $inptail, $inpperm
573 bl _vpaes_decrypt_core
575 vperm v0, v0, v0, $outperm # rotate right/left
576 vsel v1, $outhead, v0, $outmask
579 addi $out, $out, 15 # 15 is not a typo
582 lvx v1, 0, $out # redundant in aligned case
583 vsel v1, $outhead, v1, $outmask
586 li r10,`15+6*$SIZE_T`
587 li r11,`31+6*$SIZE_T`
589 mtspr 256, r7 # restore vrsave
615 .byte 0,12,0x04,1,0x80,0,3,0
617 .size .vpaes_decrypt,.-.vpaes_decrypt
619 .globl .vpaes_cbc_encrypt
622 $STU $sp,-`($FRAME+2*$SIZE_T)`($sp)
624 li r10,`15+6*$SIZE_T`
625 li r11,`31+6*$SIZE_T`
stw r12,`$FRAME-4`($sp) # save vrsave
650 $PUSH r30,`$FRAME+$SIZE_T*0`($sp)
651 $PUSH r31,`$FRAME+$SIZE_T*1`($sp)
653 $PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
655 sub. r30, r5, r9 # copy length-16
656 mr r5, r6 # copy pointer to key
657 mr r31, r7 # copy pointer to iv
659 cmpwi r8, 0 # test direction
661 mr r7, r12 # copy vrsave
662 mtspr 256, r6 # preserve all AltiVec registers
664 lvx v24, 0, r31 # load [potentially unaligned] iv
666 ?lvsl $inpperm, 0, r31
668 ?vperm v24, v24, v25, $inpperm
670 neg r8, $inp # prepare for unaligned access
672 ?lvsl $keyperm, 0, $key
673 ?lvsr $outperm, 0, $out
674 ?lvsr $inpperm, 0, r8 # -$inp
675 vnor $outmask, v7, v7 # 0xff..ff
676 lvx $inptail, 0, $inp
677 ?vperm $outmask, v7, $outmask, $outperm
678 addi $inp, $inp, 15 # 15 is not a typo
679 lvx $outhead, 0, $out
683 bl _vpaes_encrypt_preheat
688 lvx $inptail, 0, $inp
690 ?vperm v0, v0, $inptail, $inpperm
691 vxor v0, v0, v24 # ^= iv
693 bl _vpaes_encrypt_core
695 vmr v24, v0 # put aside iv
696 sub. r30, r30, r0 # len -= 16
697 vperm v0, v0, v0, $outperm # rotate right/left
698 vsel v1, $outhead, v0, $outmask
708 bl _vpaes_decrypt_preheat
713 lvx $inptail, 0, $inp
715 ?vperm v0, v0, $inptail, $inpperm
716 vmr v25, v0 # put aside input
718 bl _vpaes_decrypt_core
720 vxor v0, v0, v24 # ^= iv
722 sub. r30, r30, r0 # len -= 16
723 vperm v0, v0, v0, $outperm # rotate right/left
724 vsel v1, $outhead, v0, $outmask
732 lvx v1, 0, $out # redundant in aligned case
733 vsel v1, $outhead, v1, $outmask
736 neg r8, r31 # write [potentially unaligned] iv
737 ?lvsl $outperm, 0, r8
739 vnor $outmask, v7, v7 # 0xff..ff
740 ?vperm $outmask, v7, $outmask, $outperm
742 vperm v24, v24, v24, $outperm # rotate right/left
743 vsel v0, $outhead, v24, $outmask
746 vsel v1, v24, v1, $outmask
749 mtspr 256, r7 # restore vrsave
750 li r10,`15+6*$SIZE_T`
751 li r11,`31+6*$SIZE_T`
775 $POP r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
776 $POP r30,`$FRAME+$SIZE_T*0`($sp)
777 $POP r31,`$FRAME+$SIZE_T*1`($sp)
779 addi $sp,$sp,`$FRAME+$SIZE_T*2`
782 .byte 0,12,0x04,1,0x80,2,6,0
784 .size .vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
788 my ($inp,$bits,$out)=map("r$_",(3..5));
790 my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));
793 ########################################################
795 ## AES key schedule ##
797 ########################################################
803 li r11, 0xc0 # Lk_inv
808 vspltisb v8,4 # 0x04..04
809 vxor v9,v9,v9 # 0x00..00
810 lvx $invlo, r12, r11 # Lk_inv
814 lvx $iptlo, r12, r9 # Lk_ipt
819 lvx v14, r12, r11 # Lk_sb1
824 lvx v16, r12, r9 # Lk_dksd
828 lvx v18, r12, r11 # Lk_dksb
832 lvx v20, r12, r9 # Lk_dkse
836 lvx v22, r12, r11 # Lk_dks9
839 lvx v24, r12, r9 # Lk_rcon
840 lvx v25, 0, r12 # Lk_mc_forward[0]
lvx v26, r12, r8 # Lk_s63
844 .byte 0,12,0x14,0,0,0,0,0
847 _vpaes_schedule_core:
850 bl _vpaes_key_preheat # load the tables
852 #lvx v0, 0, $inp # vmovdqu (%rdi), %xmm0 # load key (unaligned)
853 neg r8, $inp # prepare for unaligned access
addi $inp, $inp, 15 # 15 is not a typo
856 ?lvsr $inpperm, 0, r8 # -$inp
857 lvx v6, 0, $inp # v6 serves as inptail
859 ?vperm v0, v0, v6, $inpperm
862 vmr v3, v0 # vmovdqa %xmm0, %xmm3
863 bl _vpaes_schedule_transform
864 vmr v7, v0 # vmovdqa %xmm0, %xmm7
866 bne $dir, Lschedule_am_decrypting
868 # encrypting, output zeroth round key after transform
869 li r8, 0x30 # mov \$0x30,%r8d
870 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
872 ?lvsr $outperm, 0, $out # prepare for unaligned access
873 vnor $outmask, v9, v9 # 0xff..ff
874 lvx $outhead, 0, $out
875 ?vperm $outmask, v9, $outmask, $outperm
877 #stvx v0, 0, $out # vmovdqu %xmm0, (%rdx)
878 vperm v1, v0, v0, $outperm # rotate right/left
879 vsel v2, $outhead, v1, $outmask
884 Lschedule_am_decrypting:
885 srwi r8, $bits, 1 # shr \$1,%r8d
886 andi. r8, r8, 32 # and \$32,%r8d
887 xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
888 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
889 # decrypting, output zeroth round key after shiftrows
890 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
891 vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
893 neg r0, $out # prepare for unaligned access
894 ?lvsl $outperm, 0, r0
addi $out, $out, 15 # 15 is not a typo
896 vnor $outmask, v9, v9 # 0xff..ff
897 lvx $outhead, 0, $out
898 ?vperm $outmask, $outmask, v9, $outperm
900 #stvx v4, 0, $out # vmovdqu %xmm3, (%rdx)
901 vperm v4, v4, v4, $outperm # rotate right/left
902 vsel v2, $outhead, v4, $outmask
905 xori r8, r8, 0x30 # xor \$0x30, %r8
908 cmplwi $bits, 192 # cmp \$192, %esi
916 ## 128-bit specific part of key schedule.
918 ## This schedule is really simple, because all its parts
919 ## are accomplished by the subroutines.
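##
##  (For reference: the counter below is 10 main rounds, which together with
##  the round-0 key written above gives the 11 round keys, 176 bytes, of an
##  AES-128 schedule.)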
922 li r0, 10 # mov \$10, %esi
926 bl _vpaes_schedule_round
927 bdz Lschedule_mangle_last # dec %esi
928 bl _vpaes_schedule_mangle # write output
934 ## 192-bit specific part of key schedule.
936 ## The main body of this schedule is the same as the 128-bit
937 ## schedule, but with more smearing. The long, high side is
938 ## stored in %xmm7 as before, and the short, low side is in
939 ## the high bits of %xmm6.
941 ## This schedule is somewhat nastier, however, because each
942 ## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round
## keys.
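##
##  (For reference: AES-192 needs 13 round keys in all; round 0 is written by
##  the prologue above, and the 4 trips through the loop below account for the
##  remaining 12, the very last one being emitted by Lschedule_mangle_last.)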
948 li r0, 4 # mov \$4, %esi
950 ?vperm v0, v6, v0, $inpperm
951 ?vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
952 bl _vpaes_schedule_transform # input transform
953 ?vsldoi v6, v0, v9, 8
954 ?vsldoi v6, v9, v6, 8 # clobber "low" side with zeros
958 bl _vpaes_schedule_round
959 ?vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0
960 bl _vpaes_schedule_mangle # save key n
961 bl _vpaes_schedule_192_smear
962 bl _vpaes_schedule_mangle # save key n+1
963 bl _vpaes_schedule_round
964 bdz Lschedule_mangle_last # dec %esi
965 bl _vpaes_schedule_mangle # save key n+2
966 bl _vpaes_schedule_192_smear
972 ## 256-bit specific part of key schedule.
974 ## The structure here is very similar to the 128-bit
975 ## schedule, but with an additional "low side" in
976 ## %xmm6. The low side's rounds are the same as the
977 ## high side's, except no rcon and no rotation.
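##
##  (For reference: AES-256 needs 15 round keys; round 0 again comes from the
##  first half of the key, and the 7 trips below produce the rest two at a
##  time, the final one being emitted by Lschedule_mangle_last.)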
981 li r0, 7 # mov \$7, %esi
983 lvx v0, 0, $inp # vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
984 ?vperm v0, v6, v0, $inpperm
985 bl _vpaes_schedule_transform # input transform
989 bl _vpaes_schedule_mangle # output low result
990 vmr v6, v0 # vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
993 bl _vpaes_schedule_round
994 bdz Lschedule_mangle_last # dec %esi
995 bl _vpaes_schedule_mangle
997 # low round. swap xmm7 and xmm6
998 ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
999 vmr v5, v7 # vmovdqa %xmm7, %xmm5
1000 vmr v7, v6 # vmovdqa %xmm6, %xmm7
1001 bl _vpaes_schedule_low_round
1002 vmr v7, v5 # vmovdqa %xmm5, %xmm7
1006 ## .aes_schedule_mangle_last
1008 ## Mangler for last round of key schedule
1010 ## when encrypting, outputs out(%xmm0) ^ 63
1011 ## when decrypting, outputs unskew(%xmm0)
1013 ## Always called right before return... jumps to cleanup and exits
1016 Lschedule_mangle_last:
1017 # schedule last round key from xmm0
1018 li r11, 0x2e0 # lea .Lk_deskew(%rip),%r11
1020 bne $dir, Lschedule_mangle_last_dec
1023 lvx v1, r8, r10 # vmovdqa (%r8,%r10),%xmm1
1024 li r11, 0x2c0 # lea .Lk_opt(%rip), %r11 # prepare to output transform
1025 li r9, 0x2d0 # prepare to output transform
1026 vperm v0, v0, v0, v1 # vpshufb %xmm1, %xmm0, %xmm0 # output permute
1028 lvx $iptlo, r11, r12 # reload $ipt
1030 addi $out, $out, 16 # add \$16, %rdx
1031 vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
1032 bl _vpaes_schedule_transform # output transform
1034 #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
1035 vperm v0, v0, v0, $outperm # rotate right/left
1036 vsel v2, $outhead, v0, $outmask
addi $out, $out, 15 # 15 is not a typo
1041 lvx v1, 0, $out # redundant in aligned case
1042 vsel v1, $outhead, v1, $outmask
1044 b Lschedule_mangle_done
1047 Lschedule_mangle_last_dec:
1048 lvx $iptlo, r11, r12 # reload $ipt
1050 addi $out, $out, -16 # add \$-16, %rdx
1051 vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
1052 bl _vpaes_schedule_transform # output transform
1054 #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
1055 vperm v0, v0, v0, $outperm # rotate right/left
1056 vsel v2, $outhead, v0, $outmask
addi $out, $out, -15 # -15 is not a typo
1061 lvx v1, 0, $out # redundant in aligned case
1062 vsel v1, $outhead, v1, $outmask
1065 Lschedule_mangle_done:
1068 vxor v0, v0, v0 # vpxor %xmm0, %xmm0, %xmm0
1069 vxor v1, v1, v1 # vpxor %xmm1, %xmm1, %xmm1
1070 vxor v2, v2, v2 # vpxor %xmm2, %xmm2, %xmm2
1071 vxor v3, v3, v3 # vpxor %xmm3, %xmm3, %xmm3
1072 vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
1073 vxor v5, v5, v5 # vpxor %xmm5, %xmm5, %xmm5
1074 vxor v6, v6, v6 # vpxor %xmm6, %xmm6, %xmm6
1075 vxor v7, v7, v7 # vpxor %xmm7, %xmm7, %xmm7
1079 .byte 0,12,0x14,0,0,0,0,0
1082 ## .aes_schedule_192_smear
1084 ## Smear the short, low side in the 192-bit key schedule.
1087 ## %xmm7: high side, b a x y
1088 ## %xmm6: low side, d c 0 0
1092 ## %xmm6: b+c+d b+c 0 0
1093 ## %xmm0: b+c+d b+c b a
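##
##  A worked trace of the steps below ("+" is XOR, dwords left to right):
##      xmm1 = shuffle(xmm6)    =  c      0    0  0
##      xmm6 ^= xmm1            =  c+d    c    0  0
##      xmm0 = shuffle(xmm7)    =  b      b    b  a
##      xmm6 ^= xmm0            =  b+c+d  b+c  b  a   (returned in xmm0)
##      clear low half of xmm6  =  b+c+d  b+c  0  0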
1096 _vpaes_schedule_192_smear:
1098 ?vsldoi v1, v9, v6, 12 # vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
1099 ?vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
1100 vxor v6, v6, v1 # vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
1101 vxor v6, v6, v0 # vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
1103 ?vsldoi v6, v6, v9, 8
1104 ?vsldoi v6, v9, v6, 8 # clobber low side with zeros
1107 .byte 0,12,0x14,0,0,0,0,0
1110 ## .aes_schedule_round
1112 ## Runs one main round of the key schedule on %xmm0, %xmm7
1114 ## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of
## %xmm7.
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## next rcon.
1121 ## Smears the dwords of %xmm7 by xoring the low into the
1122 ## second low, result into third, result into highest.
1124 ## Returns results in %xmm7 = %xmm0.
1125 ## Clobbers %xmm1-%xmm4, %r11.
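##
##  In plain AES-128 terms (a sketch, ignoring the vpaes basis change and the
##  0x63 bookkeeping): if %xmm7 = (w3 w2 w1 w0) is the previous round key and
##  t = SubWord(RotWord(w3)) ^ rcon, the smear plus broadcast xor produce the
##  next round key
##      (w3+w2+w1+w0+t,  w2+w1+w0+t,  w1+w0+t,  w0+t)
##  ("+" is XOR), i.e. the standard expansion recurrence written out.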
1128 _vpaes_schedule_round:
1129 # extract rcon from xmm8
1130 #vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
1131 ?vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1
1132 ?vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8
1133 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
1136 ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
1137 ?vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0
1141 # low round: same as high round, but no rotation and no rcon.
1142 _vpaes_schedule_low_round:
1144 ?vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1
1145 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
1146 vspltisb v1, 0x0f # 0x0f..0f
1147 ?vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4
1150 vand v1, v1, v0 # vpand %xmm9, %xmm0, %xmm1 # 0 = k
1151 vsrb v0, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
1152 vxor v7, v7, v4 # vpxor %xmm4, %xmm7, %xmm7
1153 vperm v2, $invhi, v9, v1 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
1154 vxor v1, v1, v0 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
1155 vperm v3, $invlo, v9, v0 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
1156 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
1157 vperm v4, $invlo, v9, v1 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
1158 vxor v7, v7, v26 # vpxor .Lk_s63(%rip), %xmm7, %xmm7
1159 vperm v3, $invlo, v9, v3 # vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
1160 vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
1161 vperm v2, $invlo, v9, v4 # vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
1162 vxor v3, v3, v1 # vpxor %xmm1, %xmm3, %xmm3 # 2 = io
1163 vxor v2, v2, v0 # vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
1164 vperm v4, v15, v9, v3 # vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
1165 vperm v1, v14, v9, v2 # vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
1166 vxor v1, v1, v4 # vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
1168 # add in smeared stuff
1169 vxor v0, v1, v7 # vpxor %xmm7, %xmm1, %xmm0
1170 vxor v7, v1, v7 # vmovdqa %xmm0, %xmm7
1173 .byte 0,12,0x14,0,0,0,0,0
1176 ## .aes_schedule_transform
1178 ## Linear-transform %xmm0 according to tables at (%r11)
1180 ## Requires that %xmm9 = 0x0F0F... as in preheat
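##
##  Net effect, per byte x of %xmm0 (a sketch; lo/hi are the 16-byte tables at
##  (%r11) and 16(%r11)):
##      out = lo[ x & 0x0f ] ^ hi[ x >> 4 ]
##  The explicit masking is folded into the vperm lookups below.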
1185 _vpaes_schedule_transform:
1186 #vand v1, v0, v9 # vpand %xmm9, %xmm0, %xmm1
1187 vsrb v2, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
1188 # vmovdqa (%r11), %xmm2 # lo
1189 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
1190 # vmovdqa 16(%r11), %xmm1 # hi
1191 vperm v2, $ipthi, $ipthi, v2 # vpshufb %xmm0, %xmm1, %xmm0
1192 vxor v0, v0, v2 # vpxor %xmm2, %xmm0, %xmm0
1195 .byte 0,12,0x14,0,0,0,0,0
1198 ## .aes_schedule_mangle
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
##      xor with 0x63
##      multiply by circulant 0,1,1,1
##      apply shiftrows transform
##
## On decrypt,
##      multiply by "inverse mixcolumns" circulant E,B,D,9
##      apply shiftrows transform
1215 ## Writes out to (%rdx), and increments or decrements it
1216 ## Keeps track of round number mod 4 in %r8
1218 ## Clobbers xmm1-xmm5
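##
##  For the encrypt path (a sketch): with k the incoming key in the vpaes basis
##  and rot() the Lk_mc_forward byte rotation, the circulant step below computes
##  rot(k^0x63) ^ rot^2(k^0x63) ^ rot^3(k^0x63), and the result is passed
##  through the Lk_sr permutation before being written out.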
1221 _vpaes_schedule_mangle:
1222 #vmr v4, v0 # vmovdqa %xmm0, %xmm4 # save xmm0 for later
1223 # vmovdqa .Lk_mc_forward(%rip),%xmm5
1224 bne $dir, Lschedule_mangle_dec
1227 vxor v4, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm4
1228 addi $out, $out, 16 # add \$16, %rdx
1229 vperm v4, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm4
1230 vperm v1, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm1
1231 vperm v3, v1, v1, v25 # vpshufb %xmm5, %xmm1, %xmm3
1232 vxor v4, v4, v1 # vpxor %xmm1, %xmm4, %xmm4
1233 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
1234 vxor v3, v3, v4 # vpxor %xmm4, %xmm3, %xmm3
1236 vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
1237 addi r8, r8, -16 # add \$-16, %r8
1238 andi. r8, r8, 0x30 # and \$0x30, %r8
1240 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
1241 vperm v1, v3, v3, $outperm # rotate right/left
1242 vsel v2, $outhead, v1, $outmask
1248 Lschedule_mangle_dec:
1249 # inverse mix columns
1250 # lea .Lk_dksd(%rip),%r11
1251 vsrb v1, v0, v8 # vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
1252 #and v4, v0, v9 # vpand %xmm9, %xmm4, %xmm4 # 4 = lo
1254 # vmovdqa 0x00(%r11), %xmm2
1255 vperm v2, v16, v16, v0 # vpshufb %xmm4, %xmm2, %xmm2
1256 # vmovdqa 0x10(%r11), %xmm3
1257 vperm v3, v17, v17, v1 # vpshufb %xmm1, %xmm3, %xmm3
1258 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1259 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1261 # vmovdqa 0x20(%r11), %xmm2
1262 vperm v2, v18, v18, v0 # vpshufb %xmm4, %xmm2, %xmm2
1263 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1264 # vmovdqa 0x30(%r11), %xmm3
1265 vperm v3, v19, v19, v1 # vpshufb %xmm1, %xmm3, %xmm3
1266 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1267 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1269 # vmovdqa 0x40(%r11), %xmm2
1270 vperm v2, v20, v20, v0 # vpshufb %xmm4, %xmm2, %xmm2
1271 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1272 # vmovdqa 0x50(%r11), %xmm3
1273 vperm v3, v21, v21, v1 # vpshufb %xmm1, %xmm3, %xmm3
1274 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1276 # vmovdqa 0x60(%r11), %xmm2
1277 vperm v2, v22, v22, v0 # vpshufb %xmm4, %xmm2, %xmm2
1278 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1279 # vmovdqa 0x70(%r11), %xmm4
1280 vperm v4, v23, v23, v1 # vpshufb %xmm1, %xmm4, %xmm4
1281 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
1282 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1283 vxor v3, v4, v2 # vpxor %xmm2, %xmm4, %xmm3
1285 addi $out, $out, -16 # add \$-16, %rdx
1287 vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
1288 addi r8, r8, -16 # add \$-16, %r8
1289 andi. r8, r8, 0x30 # and \$0x30, %r8
1291 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
1292 vperm v1, v3, v3, $outperm # rotate right/left
1293 vsel v2, $outhead, v1, $outmask
1298 .byte 0,12,0x14,0,0,0,0,0
1300 .globl .vpaes_set_encrypt_key
1302 .vpaes_set_encrypt_key:
1303 $STU $sp,-$FRAME($sp)
1304 li r10,`15+6*$SIZE_T`
1305 li r11,`31+6*$SIZE_T`
1307 mfspr r6, 256 # save vrsave
stw r6,`$FRAME-4`($sp) # save vrsave
1332 $PUSH r0, `$FRAME+$LRSAVE`($sp)
1333 mtspr 256, r7 # preserve all AltiVec registers
1335 srwi r9, $bits, 5 # shr \$5,%eax
1336 addi r9, r9, 6 # add \$5,%eax
1337 stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
1339 cmplw $dir, $bits, $bits # set encrypt direction
1340 li r8, 0x30 # mov \$0x30,%r8d
1341 bl _vpaes_schedule_core
1343 $POP r0, `$FRAME+$LRSAVE`($sp)
1344 li r10,`15+6*$SIZE_T`
1345 li r11,`31+6*$SIZE_T`
1346 mtspr 256, r6 # restore vrsave
1374 .byte 0,12,0x04,1,0x80,0,3,0
1376 .size .vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key
1378 .globl .vpaes_set_decrypt_key
1380 .vpaes_set_decrypt_key:
1381 $STU $sp,-$FRAME($sp)
1382 li r10,`15+6*$SIZE_T`
1383 li r11,`31+6*$SIZE_T`
1385 mfspr r6, 256 # save vrsave
stw r6,`$FRAME-4`($sp) # save vrsave
1410 $PUSH r0, `$FRAME+$LRSAVE`($sp)
1411 mtspr 256, r7 # preserve all AltiVec registers
1413 srwi r9, $bits, 5 # shr \$5,%eax
1414 addi r9, r9, 6 # add \$5,%eax
1415 stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
1417 slwi r9, r9, 4 # shl \$4,%eax
1418 add $out, $out, r9 # lea (%rdx,%rax),%rdx
1420 cmplwi $dir, $bits, 0 # set decrypt direction
1421 srwi r8, $bits, 1 # shr \$1,%r8d
1422 andi. r8, r8, 32 # and \$32,%r8d
1423 xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
1424 bl _vpaes_schedule_core
1426 $POP r0, `$FRAME+$LRSAVE`($sp)
1427 li r10,`15+6*$SIZE_T`
1428 li r11,`31+6*$SIZE_T`
1429 mtspr 256, r6 # restore vrsave
1457 .byte 0,12,0x04,1,0x80,0,3,0
1459 .size .vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
1464 foreach (split("\n",$code)) {
1465 s/\`([^\`]*)\`/eval $1/geo;
1467 # constants table endian-specific conversion
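# For example: ".long 0x01020300, ... ?inv" is flattened to the bytes
# 0x01,0x02,0x03,0x00,...; big-endian flavours emit them as-is, while
# "le" flavours complement ?inv (vperm index) bytes with 0xf, giving
# 0x0e,0x0d,0x0c,0x0f,..., reverse the 16 bytes of ?rev rows, and leave
# ?asis rows untouched.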
1468 if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
1472 # convert to endian-agnostic format
1473 foreach (split(/,\s+/,$1)) {
1474 my $l = /^0/?oct:int;
1475 push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
1478 # little-endian conversion
1479 if ($flavour =~ /le$/o) {
1480 SWITCH: for($conv) {
1481 /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
1482 /\?rev/ && do { @bytes=reverse(@bytes); last; };
1487 print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
1490 $consts=0 if (m/Lconsts:/o); # end of table
1492 # instructions prefixed with '?' are endian-specific and need
1493 # to be adjusted accordingly...
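# e.g. on "le" flavours "?vperm v5, v5, v6, v31" gets its two middle operands
# swapped to "vperm v5, v6, v5, v31", "?vsldoi v6, v6, v9, 8" becomes
# "vsldoi v6, v9, v6, 16-8", and "?vspltw v0, v0, 3" becomes "vspltw v0, v0, 3-3".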
1494 if ($flavour =~ /le$/o) { # little-endian
1497 s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
1498 s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
1499 s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
1500 } else { # big-endian