######################################################################
## Constant-time SSSE3 AES core implementation.
## By Mike Hamburg (Stanford University), 2009
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.

# CBC encrypt/decrypt performance in cycles per byte processed with
# a 128-bit key.
#
#		aes-ppc.pl		this
# G4e		35.5/52.1/(23.8)	11.9(*)/15.4
# POWER6	42.7/54.3/(28.2)	63.0/92.8(**)
# POWER7	32.3/42.9/(18.4)	18.5/23.3
#
# (*)	This is ~10% worse than reported in the paper. The reason is
#	twofold. First, this module doesn't make any assumption about
#	key schedule (or data, for that matter) alignment and handles
#	it in-line. Second, being transliterated from vpaes-x86_64.pl,
#	it relies on "nested inversion", which is better suited to
#	Intel CPUs.
# (**)	Inadequate POWER6 performance is due to astronomical AltiVec
#	latency, 9 cycles per simple logical operation.
if ($flavour =~ /64/) {
} elsif ($flavour =~ /32/) {
} else { die "nonsense $flavour"; }

$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
.align	7		# totally strategic alignment
Lk_mc_forward:	# mc_forward
	.long	0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c	?inv
	.long	0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300	?inv
	.long	0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704	?inv
	.long	0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08	?inv
Lk_mc_backward:	# mc_backward
	.long	0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e	?inv
	.long	0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a	?inv
	.long	0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506	?inv
	.long	0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102	?inv
Lk_sr:		# shiftrows
	.long	0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f	?inv
	.long	0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b	?inv
	.long	0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07	?inv
	.long	0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603	?inv
Lk_inv:		# inv, inva
	.long	0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704	?rev
	.long	0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03	?rev
Lk_ipt:		# input transform (lo, hi)
	.long	0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca	?rev
	.long	0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd	?rev
Lk_sb1:		# sb1u, sb1t
	.long	0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15	?rev
	.long	0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e	?rev
Lk_sb2:		# sb2u, sb2t
	.long	0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b	?rev
	.long	0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5	?rev
Lk_sbo:		# sbou, sbot
	.long	0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2	?rev
	.long	0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e	?rev
Lk_dipt:	# decryption input transform
	.long	0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15	?rev
	.long	0x00650560, 0xe683e386, 0x94f191f4, 0x72177712	?rev
Lk_dsbo:	# decryption sbox final output
	.long	0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7	?rev
	.long	0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca	?rev
Lk_dsb9:	# decryption sbox output *9*u, *9*t
	.long	0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca	?rev
	.long	0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72	?rev
Lk_dsbd:	# decryption sbox output *D*u, *D*t
	.long	0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5	?rev
	.long	0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129	?rev
Lk_dsbb:	# decryption sbox output *B*u, *B*t
	.long	0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660	?rev
	.long	0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3	?rev
Lk_dsbe:	# decryption sbox output *E*u, *E*t
	.long	0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222	?rev
	.long	0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794	?rev

##
## Key schedule constants
##
Lk_dksd:	# decryption key schedule: invskew x*D
	.long	0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007	?rev
	.long	0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f	?rev
Lk_dksb:	# decryption key schedule: invskew x*B
	.long	0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603	?rev
	.long	0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9	?rev
Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
	.long	0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553	?rev
	.long	0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd	?rev
Lk_dks9:	# decryption key schedule: invskew x*9
	.long	0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a	?rev
	.long	0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b	?rev
Lk_rcon:	# rcon
	.long	0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70	?asis
Lk_s63:
	.long	0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b	?asis
Lk_opt:		# output transform
	.long	0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7	?rev
	.long	0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1	?rev
Lk_deskew:	# deskew tables: inverts the sbox's "skew"
	.long	0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d	?rev
	.long	0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128	?rev
	mflr	r12		#vvvvv "distance between . and _vpaes_consts"
	.byte	0,12,0x14,0,0,0,0,0
	.asciz	"Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"

my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));
my ($inp,$out,$key) = map("r$_",(3..5));

my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));

## Fills register %r10 -> .aes_consts (so you can -fPIC)
## and %xmm9-%xmm15 as specified below.
_vpaes_encrypt_preheat:
	li	r11, 0xc0		# Lk_inv
	vxor	v7, v7, v7		# 0x00..00
	vspltisb	v8, 4		# 0x04..04
	vspltisb	v9, 0x0f	# 0x0f..0f
	.byte	0,12,0x14,0,0,0,0,0

## AES-encrypt %xmm0.
## %xmm9-%xmm15 as in _vpaes_preheat
## (%rdx) = scheduled keys
## Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax
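##
## As a rough scalar sketch (illustrative only, not part of the emitted
## code): every table used below has just 16 entries, so an 8-bit
## lookup becomes two 4-bit vperm lookups combined with xor. E.g. the
## input transform right below computes, per byte,
##
##	out = iptlo[x & 0x0f] ^ ipthi[x >> 4] ^ round0_key
##
## so no data-dependent memory access (and hence no cache-timing leak)
## ever occurs.
##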
	lwz	r8, 240($key)		# pull rounds
	lvx	v5, 0, $key		# vmovdqu (%r9), %xmm5	# round0 key
	?vperm	v5, v5, v6, $keyperm	# align round key
	vsrb	v1, v0, v8		# vpsrlb \$4, %xmm0, %xmm0
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb %xmm1, %xmm2, %xmm1
	vperm	v1, $ipthi, $ipthi, v1	# vpshufb %xmm0, %xmm3, %xmm2
	vxor	v0, v0, v5		# vpxor %xmm5, %xmm1, %xmm0
	vxor	v0, v0, v1		# vpxor %xmm2, %xmm0, %xmm0

	# middle of middle round
	vperm	v4, $sb1t, v7, v2	# vpshufb %xmm2, %xmm13, %xmm4	# 4 = sb1u
	lvx	v1, r12, r11		# vmovdqa -0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	vperm	v0, $sb1u, v7, v3	# vpshufb %xmm3, %xmm12, %xmm0	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor %xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	andi.	r11, r11, 0x30		# and \$0x30, %r11	# ... mod 4
	vperm	v5, $sb2t, v7, v2	# vpshufb %xmm2, %xmm15, %xmm5	# 4 = sb2u
	vxor	v0, v0, v4		# vpxor %xmm4, %xmm0, %xmm0	# 0 = A
	vperm	v2, $sb2u, v7, v3	# vpshufb %xmm3, %xmm14, %xmm2	# 2 = sb2t
	lvx	v4, r12, r10		# vmovdqa (%r11,%r10), %xmm4	# .Lk_mc_backward[]
	vperm	v3, v0, v7, v1		# vpshufb %xmm1, %xmm0, %xmm3	# 0 = B
	vxor	v2, v2, v5		# vpxor %xmm5, %xmm2, %xmm2	# 2 = 2A
	vperm	v0, v0, v7, v4		# vpshufb %xmm4, %xmm0, %xmm0	# 3 = D
	vxor	v3, v3, v2		# vpxor %xmm2, %xmm3, %xmm3	# 0 = 2A+B
	vperm	v4, v3, v7, v1		# vpshufb %xmm1, %xmm3, %xmm4	# 0 = 2B+C
	vxor	v0, v0, v3		# vpxor %xmm3, %xmm0, %xmm0	# 3 = 2A+B+D
	vxor	v0, v0, v4		# vpxor %xmm4, %xmm0, %xmm0	# 0 = 2A+3B+C+D
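	# (in effect one MixColumns step: each output byte is
	# 2*A ^ 3*B ^ C ^ D, the {02,03,01,01} circulant, assembled from
	# byte-rotated copies via mc_forward/mc_backward instead of
	# GF(2^8) multiplications)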
	vsrb	v1, v0, v8		# vpsrlb \$4, %xmm0, %xmm0	# 1 = i
	vperm	v5, $invhi, $invhi, v0	# vpshufb %xmm1, %xmm11, %xmm5	# 2 = a/k
	vxor	v0, v0, v1		# vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	vperm	v3, $invlo, $invlo, v1	# vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	vperm	v4, $invlo, $invlo, v0	# vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	vxor	v3, v3, v5		# vpxor %xmm5, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	vxor	v4, v4, v5		# vpxor %xmm5, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v7, v3	# vpshufb %xmm3, %xmm10, %xmm2	# 2 = 1/iak
	lvx	v6, r9, $key		# vmovdqu (%r9), %xmm5
	vperm	v3, $invlo, v7, v4	# vpshufb %xmm4, %xmm10, %xmm3	# 3 = 1/jak
	vxor	v2, v2, v0		# vpxor %xmm1, %xmm2, %xmm2	# 2 = io
	?vperm	v5, v5, v6, $keyperm	# align round key
	vxor	v3, v3, v1		# vpxor %xmm0, %xmm3, %xmm3	# 3 = jo
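	# (this block is the "nested inversion" of Hamburg's paper: the
	# GF(2^8) inverse inside SubBytes is decomposed into a short
	# chain of 4-bit lookups into $invlo/$invhi plus xors, keeping
	# every lookup inside a 16-entry table)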
	# middle of last round
					# vmovdqa -0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
					# vmovdqa -0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	vperm	v4, $sbou, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbou
	lvx	v1, r12, r10		# vmovdqa 0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	vperm	v0, $sbot, v7, v3	# vpshufb %xmm3, %xmm0, %xmm0	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor %xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	vxor	v0, v0, v4		# vpxor %xmm4, %xmm0, %xmm0	# 0 = A
	vperm	v0, v0, v7, v1		# vpshufb %xmm1, %xmm0, %xmm0
	.byte	0,12,0x14,0,0,0,0,0

.globl	.vpaes_encrypt
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mfspr	r7, 256			# save vrsave
	stw	r7,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r6,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			# preserve all AltiVec registers

	bl	_vpaes_encrypt_preheat

	?lvsl	$inpperm, 0, $inp	# prepare for unaligned access
	addi	$inp, $inp, 15		# 15 is not a typo
	?lvsr	$outperm, 0, $out
	?lvsl	$keyperm, 0, $key	# prepare for unaligned access
	vnor	$outmask, v7, v7	# 0xff..ff
	lvx	$inptail, 0, $inp	# redundant in aligned case
	?vperm	$outmask, v7, $outmask, $outperm
	lvx	$outhead, 0, $out
	?vperm	v0, v0, $inptail, $inpperm

	bl	_vpaes_encrypt_core

	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v1, $outhead, v0, $outmask
	addi	$out, $out, 15		# 15 is not a typo
	lvx	v1, 0, $out		# redundant in aligned case
	vsel	v1, $outhead, v1, $outmask

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r7			# restore vrsave
	.byte	0,12,0x04,1,0x80,0,3,0
.size	.vpaes_encrypt,.-.vpaes_encrypt
_vpaes_decrypt_preheat:
	li	r11, 0xc0		# Lk_inv
	vxor	v7, v7, v7		# 0x00..00
	vspltisb	v8, 4		# 0x04..04
	vspltisb	v9, 0x0f	# 0x0f..0f
	.byte	0,12,0x14,0,0,0,0,0

## Same API as encryption core.
	lwz	r8, 240($key)		# pull rounds
	lvx	v5, 0, $key		# vmovdqu (%r9), %xmm4	# round0 key
	?vperm	v5, v5, v6, $keyperm	# align round key
	vsrb	v1, v0, v8		# vpsrlb \$4, %xmm0, %xmm0
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb %xmm1, %xmm2, %xmm2
	vperm	v1, $ipthi, $ipthi, v1	# vpshufb %xmm0, %xmm1, %xmm0
	vxor	v0, v0, v5		# vpxor %xmm4, %xmm2, %xmm2
	vxor	v0, v0, v1		# vpxor %xmm2, %xmm0, %xmm0

	# Inverse mix columns
	lvx	v0, r12, r11		# v5 and v0 are flipped
					# vmovdqa -0x20(%r10),%xmm4	# 4 : sb9u
					# vmovdqa -0x10(%r10),%xmm1	# 0 : sb9t
	vperm	v4, $sb9u, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sb9u
	vperm	v1, $sb9t, v7, v3	# vpshufb %xmm3, %xmm1, %xmm1	# 0 = sb9t
	vxor	v5, v5, v4		# vpxor %xmm4, %xmm0, %xmm0
					# vmovdqa 0x00(%r10),%xmm4	# 4 : sbdu
	vxor	v5, v5, v1		# vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
					# vmovdqa 0x10(%r10),%xmm1	# 0 : sbdt
	vperm	v4, $sbdu, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbdu
	vperm	v5, v5, v7, v0		# vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	vperm	v1, $sbdt, v7, v3	# vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbdt
	vxor	v5, v5, v4		# vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
					# vmovdqa 0x20(%r10), %xmm4	# 4 : sbbu
	vxor	v5, v5, v1		# vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
					# vmovdqa 0x30(%r10), %xmm1	# 0 : sbbt
	vperm	v4, $sbbu, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbbu
	vperm	v5, v5, v7, v0		# vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	vperm	v1, $sbbt, v7, v3	# vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbbt
	vxor	v5, v5, v4		# vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
					# vmovdqa 0x40(%r10), %xmm4	# 4 : sbeu
	vxor	v5, v5, v1		# vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
					# vmovdqa 0x50(%r10), %xmm1	# 0 : sbet
	vperm	v4, $sbeu, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbeu
	vperm	v5, v5, v7, v0		# vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	vperm	v1, $sbet, v7, v3	# vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbet
	vxor	v0, v5, v4		# vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
	vxor	v0, v0, v1		# vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
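	# (the sb9/sbd/sbb/sbe lookups give the inverse-sbox output
	# multiplied by 9, D, B and E; xoring them in, with an MC
	# rotation of the running value between pairs, in effect
	# accumulates the {0e,0b,0d,09} inverse MixColumns circulant)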
	vsrb	v1, v0, v8		# vpsrlb \$4, %xmm0, %xmm0	# 1 = i
	vperm	v2, $invhi, $invhi, v0	# vpshufb %xmm1, %xmm11, %xmm2	# 2 = a/k
	vxor	v0, v0, v1		# vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	vperm	v3, $invlo, $invlo, v1	# vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	vperm	v4, $invlo, $invlo, v0	# vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	vxor	v3, v3, v2		# vpxor %xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	vxor	v4, v4, v2		# vpxor %xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v7, v3	# vpshufb %xmm3, %xmm10, %xmm2	# 2 = 1/iak
	lvx	v6, r9, $key		# vmovdqu (%r9), %xmm0
	vperm	v3, $invlo, v7, v4	# vpshufb %xmm4, %xmm10, %xmm3	# 3 = 1/jak
	vxor	v2, v2, v0		# vpxor %xmm1, %xmm2, %xmm2	# 2 = io
	?vperm	v5, v5, v6, $keyperm	# align round key
	vxor	v3, v3, v1		# vpxor %xmm0, %xmm3, %xmm3	# 3 = jo

	# middle of last round
					# vmovdqa 0x60(%r10), %xmm4	# 3 : sbou
	vperm	v4, $sbou, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbou
					# vmovdqa 0x70(%r10), %xmm1	# 0 : sbot
	lvx	v2, r12, r10		# vmovdqa -0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	vperm	v1, $sbot, v7, v3	# vpshufb %xmm3, %xmm1, %xmm1	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor %xmm0, %xmm4, %xmm4	# 4 = sb1u + k
	vxor	v0, v1, v4		# vpxor %xmm4, %xmm1, %xmm0	# 0 = A
	vperm	v0, v0, v7, v2		# vpshufb %xmm2, %xmm0, %xmm0
	.byte	0,12,0x14,0,0,0,0,0

.globl	.vpaes_decrypt
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mfspr	r7, 256			# save vrsave
	stw	r7,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r6,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			# preserve all AltiVec registers

	bl	_vpaes_decrypt_preheat

	?lvsl	$inpperm, 0, $inp	# prepare for unaligned access
	addi	$inp, $inp, 15		# 15 is not a typo
	?lvsr	$outperm, 0, $out
	?lvsl	$keyperm, 0, $key
	vnor	$outmask, v7, v7	# 0xff..ff
	lvx	$inptail, 0, $inp	# redundant in aligned case
	?vperm	$outmask, v7, $outmask, $outperm
	lvx	$outhead, 0, $out
	?vperm	v0, v0, $inptail, $inpperm

	bl	_vpaes_decrypt_core

	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v1, $outhead, v0, $outmask
	addi	$out, $out, 15		# 15 is not a typo
	lvx	v1, 0, $out		# redundant in aligned case
	vsel	v1, $outhead, v1, $outmask

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r7			# restore vrsave
	.byte	0,12,0x04,1,0x80,0,3,0
.size	.vpaes_decrypt,.-.vpaes_decrypt
.globl	.vpaes_cbc_encrypt
	$STU	$sp,-`($FRAME+2*$SIZE_T)`($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	stw	r12,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r30,`$FRAME+$SIZE_T*0`($sp)
	$PUSH	r31,`$FRAME+$SIZE_T*1`($sp)
	$PUSH	r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)

	and	r30, r5, r9		# copy length&-16
	mr	r5, r6			# copy pointer to key
	mr	r31, r7			# copy pointer to iv
	cmpwi	r8, 0			# test direction
	mr	r7, r12			# copy vrsave
	mtspr	256, r6			# preserve all AltiVec registers

	lvx	v24, 0, r31		# load [potentially unaligned] iv
	?lvsl	$inpperm, 0, r31
	?vperm	v24, v24, v25, $inpperm

	neg	r8, $inp		# prepare for unaligned access
	?lvsl	$keyperm, 0, $key
	?lvsr	$outperm, 0, $out
	?lvsr	$inpperm, 0, r8		# -$inp
	vnor	$outmask, v7, v7	# 0xff..ff
	lvx	$inptail, 0, $inp
	?vperm	$outmask, v7, $outmask, $outperm
	addi	$inp, $inp, 15		# 15 is not a typo
	lvx	$outhead, 0, $out

	bl	_vpaes_encrypt_preheat

	lvx	$inptail, 0, $inp
	?vperm	v0, v0, $inptail, $inpperm
	vxor	v0, v0, v24		# ^= iv

	bl	_vpaes_encrypt_core

	vmr	v24, v0			# put aside iv
	sub.	r30, r30, r0		# len -= 16
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v1, $outhead, v0, $outmask
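	# (encrypt loop, per 16-byte block: C[i] = Encrypt(P[i] ^ C[i-1]);
	# the fresh ciphertext kept in v24 is the chaining value for the
	# next block)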
	bl	_vpaes_decrypt_preheat

	lvx	$inptail, 0, $inp
	?vperm	v0, v0, $inptail, $inpperm
	vmr	v25, v0			# put aside input

	bl	_vpaes_decrypt_core

	vxor	v0, v0, v24		# ^= iv
	sub.	r30, r30, r0		# len -= 16
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v1, $outhead, v0, $outmask
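	# (decrypt loop, per 16-byte block: P[i] = Decrypt(C[i]) ^ C[i-1];
	# the raw ciphertext saved in v25 becomes the chaining value for
	# the next block)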
	lvx	v1, 0, $out		# redundant in aligned case
	vsel	v1, $outhead, v1, $outmask

	neg	r8, r31			# write [potentially unaligned] iv
	?lvsl	$outperm, 0, r8
	vnor	$outmask, v7, v7	# 0xff..ff
	?vperm	$outmask, v7, $outmask, $outperm
	vperm	v24, v24, v24, $outperm	# rotate right/left
	vsel	v0, $outhead, v24, $outmask
	vsel	v1, v24, v1, $outmask

	mtspr	256, r7			# restore vrsave
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	$POP	r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
	$POP	r30,`$FRAME+$SIZE_T*0`($sp)
	$POP	r31,`$FRAME+$SIZE_T*1`($sp)
	addi	$sp,$sp,`$FRAME+$SIZE_T*2`
	.byte	0,12,0x04,1,0x80,2,6,0
.size	.vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
my ($inp,$bits,$out)=map("r$_",(3..5));
my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
	li	r11, 0xc0		# Lk_inv
	vspltisb	v8, 4		# 0x04..04
	vxor	v9, v9, v9		# 0x00..00
	lvx	$invlo, r12, r11	# Lk_inv
	lvx	$iptlo, r12, r9		# Lk_ipt
	lvx	v14, r12, r11		# Lk_sb1
	lvx	v16, r12, r9		# Lk_dksd
	lvx	v18, r12, r11		# Lk_dksb
	lvx	v20, r12, r9		# Lk_dkse
	lvx	v22, r12, r11		# Lk_dks9
	lvx	v24, r12, r9		# Lk_rcon
	lvx	v25, 0, r12		# Lk_mc_forward[0]
	lvx	v26, r12, r8		# Lk_s63
	.byte	0,12,0x14,0,0,0,0,0
_vpaes_schedule_core:
	bl	_vpaes_key_preheat	# load the tables

	#lvx	v0, 0, $inp		# vmovdqu (%rdi), %xmm0	# load key (unaligned)
	neg	r8, $inp		# prepare for unaligned access
	addi	$inp, $inp, 15		# 15 is not typo
	?lvsr	$inpperm, 0, r8		# -$inp
	lvx	v6, 0, $inp		# v6 serves as inptail
	?vperm	v0, v0, v6, $inpperm

	vmr	v3, v0			# vmovdqa %xmm0, %xmm3
	bl	_vpaes_schedule_transform
	vmr	v7, v0			# vmovdqa %xmm0, %xmm7

	bne	$dir, Lschedule_am_decrypting

	# encrypting, output zeroth round key after transform
	li	r8, 0x30		# mov \$0x30,%r8d
	addi	r10, r12, 0x80		# lea .Lk_sr(%rip),%r10

	?lvsr	$outperm, 0, $out	# prepare for unaligned access
	vnor	$outmask, v9, v9	# 0xff..ff
	lvx	$outhead, 0, $out
	?vperm	$outmask, v9, $outmask, $outperm

	#stvx	v0, 0, $out		# vmovdqu %xmm0, (%rdx)
	vperm	v1, v0, v0, $outperm	# rotate right/left
	vsel	v2, $outhead, v1, $outmask

Lschedule_am_decrypting:
	srwi	r8, $bits, 1		# shr \$1,%r8d
	andi.	r8, r8, 32		# and \$32,%r8d
	xori	r8, r8, 32		# xor \$32,%r8d	# nbits==192?0:32
	addi	r10, r12, 0x80		# lea .Lk_sr(%rip),%r10
	# decrypting, output zeroth round key after shiftrows
	lvx	v1, r8, r10		# vmovdqa (%r8,%r10), %xmm1
	vperm	v4, v3, v3, v1		# vpshufb %xmm1, %xmm3, %xmm3

	neg	r0, $out		# prepare for unaligned access
	?lvsl	$outperm, 0, r0
	addi	$out, $out, 15		# 15 is not typo
	vnor	$outmask, v9, v9	# 0xff..ff
	lvx	$outhead, 0, $out
	?vperm	$outmask, $outmask, v9, $outperm

	#stvx	v4, 0, $out		# vmovdqu %xmm3, (%rdx)
	vperm	v4, v4, v4, $outperm	# rotate right/left
	vsel	v2, $outhead, v4, $outmask

	xori	r8, r8, 0x30		# xor \$0x30, %r8
	cmplwi	$bits, 192		# cmp \$192, %esi

##
## 128-bit specific part of key schedule.
##
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
##
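## (The loop below runs 10 times -- one _vpaes_schedule_round plus
## _vpaes_schedule_mangle per AES-128 round -- producing the 10 round
## keys that follow the zeroth key written above.)
##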
	li	r0, 10			# mov \$10, %esi

	bl	_vpaes_schedule_round
	bdz	Lschedule_mangle_last	# dec %esi
	bl	_vpaes_schedule_mangle	# write output

##
## 192-bit specific part of key schedule.
##
## The main body of this schedule is the same as the 128-bit
## schedule, but with more smearing. The long, high side is
## stored in %xmm7 as before, and the short, low side is in
## the high bits of %xmm6.
##
## This schedule is somewhat nastier, however, because each
## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round
## keys.
##
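## (With the loop below running 4 times that is 4*3 = 12 round keys,
## which together with the initial key gives the 13 needed for
## AES-192.)
##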
	li	r0, 4			# mov \$4, %esi

	?vperm	v0, v6, v0, $inpperm
	?vsldoi	v0, v3, v0, 8		# vmovdqu 8(%rdi),%xmm0	# load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	# input transform
	?vsldoi	v6, v0, v9, 8
	?vsldoi	v6, v9, v6, 8		# clobber "low" side with zeros

	bl	_vpaes_schedule_round
	?vsldoi	v0, v6, v0, 8		# vpalignr \$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle	# save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle	# save key n+1
	bl	_vpaes_schedule_round
	bdz	Lschedule_mangle_last	# dec %esi
	bl	_vpaes_schedule_mangle	# save key n+2
	bl	_vpaes_schedule_192_smear

##
## 256-bit specific part of key schedule.
##
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
##
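## (Each pass of the loop below performs one high-side and one
## low-side round and writes two round keys; 7 passes give 14 keys,
## which together with the initial key are the 15 needed for AES-256.)
##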
	li	r0, 7			# mov \$7, %esi

	lvx	v0, 0, $inp		# vmovdqu 16(%rdi),%xmm0	# load key part 2 (unaligned)
	?vperm	v0, v6, v0, $inpperm
	bl	_vpaes_schedule_transform	# input transform

	bl	_vpaes_schedule_mangle	# output low result
	vmr	v6, v0			# vmovdqa %xmm0, %xmm6	# save cur_lo in xmm6

	bl	_vpaes_schedule_round
	bdz	Lschedule_mangle_last	# dec %esi
	bl	_vpaes_schedule_mangle

	# low round. swap xmm7 and xmm6
	?vspltw	v0, v0, 3		# vpshufd \$0xFF, %xmm0, %xmm0
	vmr	v5, v7			# vmovdqa %xmm7, %xmm5
	vmr	v7, v6			# vmovdqa %xmm6, %xmm7
	bl	_vpaes_schedule_low_round
	vmr	v7, v5			# vmovdqa %xmm5, %xmm7

##
## .aes_schedule_mangle_last
##
## Mangler for last round of key schedule
##   when encrypting, outputs out(%xmm0) ^ 63
##   when decrypting, outputs unskew(%xmm0)
##
## Always called right before return... jumps to cleanup and exits
##
Lschedule_mangle_last:
	# schedule last round key from xmm0
	li	r11, 0x2e0		# lea .Lk_deskew(%rip),%r11
	bne	$dir, Lschedule_mangle_last_dec

	lvx	v1, r8, r10		# vmovdqa (%r8,%r10),%xmm1
	li	r11, 0x2c0		# lea .Lk_opt(%rip), %r11	# prepare to output transform
	li	r9, 0x2d0		# prepare to output transform
	vperm	v0, v0, v0, v1		# vpshufb %xmm1, %xmm0, %xmm0	# output permute

	lvx	$iptlo, r11, r12	# reload $ipt
	addi	$out, $out, 16		# add \$16, %rdx
	vxor	v0, v0, v26		# vpxor .Lk_s63(%rip), %xmm0, %xmm0
	bl	_vpaes_schedule_transform	# output transform

	#stvx	v0, r0, $out		# vmovdqu %xmm0, (%rdx)	# save last key
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v2, $outhead, v0, $outmask

	addi	$out, $out, 15		# 15 is not typo
	lvx	v1, 0, $out		# redundant in aligned case
	vsel	v1, $outhead, v1, $outmask

	b	Lschedule_mangle_done

Lschedule_mangle_last_dec:
	lvx	$iptlo, r11, r12	# reload $ipt
	addi	$out, $out, -16		# add \$-16, %rdx
	vxor	v0, v0, v26		# vpxor .Lk_s63(%rip), %xmm0, %xmm0
	bl	_vpaes_schedule_transform	# output transform

	#stvx	v0, r0, $out		# vmovdqu %xmm0, (%rdx)	# save last key
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v2, $outhead, v0, $outmask

	addi	$out, $out, -15		# -15 is not typo
	lvx	v1, 0, $out		# redundant in aligned case
	vsel	v1, $outhead, v1, $outmask

Lschedule_mangle_done:
	vxor	v0, v0, v0		# vpxor %xmm0, %xmm0, %xmm0
	vxor	v1, v1, v1		# vpxor %xmm1, %xmm1, %xmm1
	vxor	v2, v2, v2		# vpxor %xmm2, %xmm2, %xmm2
	vxor	v3, v3, v3		# vpxor %xmm3, %xmm3, %xmm3
	vxor	v4, v4, v4		# vpxor %xmm4, %xmm4, %xmm4
	vxor	v5, v5, v5		# vpxor %xmm5, %xmm5, %xmm5
	vxor	v6, v6, v6		# vpxor %xmm6, %xmm6, %xmm6
	vxor	v7, v7, v7		# vpxor %xmm7, %xmm7, %xmm7
	.byte	0,12,0x14,0,0,0,0,0
##
## .aes_schedule_192_smear
##
## Smear the short, low side in the 192-bit key schedule.
##
## Inputs:
##   %xmm7: high side, b  a  x  y
##   %xmm6: low side,  d  c  0  0
##
## Outputs:
##   %xmm6: b+c+d  b+c  0  0
##   %xmm0: b+c+d  b+c  b  a
##
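## In word terms the routine amounts to (sketch only, same d c 0 0 /
## b a x y notation as above):
##
##	xmm6 ^= ( c  0  0  0 )		# xmm6 shifted by one word
##	xmm6 ^= ( b  b  b  a )		# taken from xmm7
##	xmm0  = xmm6			# = b+c+d  b+c  b  a
##	xmm6 low half cleared		# = b+c+d  b+c  0  0
##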
_vpaes_schedule_192_smear:
	?vsldoi	v1, v9, v6, 12		# vpshufd \$0x80, %xmm6, %xmm1	# d c 0 0 -> c 0 0 0
	?vsldoi	v0, v7, v0, 8		# vpshufd \$0xFE, %xmm7, %xmm0	# b a _ _ -> b b b a
	vxor	v6, v6, v1		# vpxor %xmm1, %xmm6, %xmm6	# -> c+d c 0 0
	vxor	v6, v6, v0		# vpxor %xmm0, %xmm6, %xmm6	# -> b+c+d b+c b a
	?vsldoi	v6, v6, v9, 8
	?vsldoi	v6, v9, v6, 8		# clobber low side with zeros
	.byte	0,12,0x14,0,0,0,0,0

##
## .aes_schedule_round
##
## Runs one main round of the key schedule on %xmm0, %xmm7.
##
## Specifically, runs subbytes on the high dword of %xmm0, then
## rotates it by one byte and xors it into the low dword of %xmm7.
##
## Adds rcon from the low byte of %xmm8, then rotates %xmm8 for
## the next rcon.
##
## Smears the dwords of %xmm7 by xoring the low into the second
## low, the result into the third, and that result into the highest.
##
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm4, %r11.
##
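## For reference, in scalar terms this is the textbook AES key
## expansion step (sketch only):
##
##	t     = SubWord(RotWord(w[3])) ^ rcon;
##	w[0] ^= t;			# new first word
##	w[1] ^= w[0];			# the "smear"
##	w[2] ^= w[1];
##	w[3] ^= w[2];
##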
_vpaes_schedule_round:
	# extract rcon from xmm8
	#vxor	v4, v4, v4		# vpxor %xmm4, %xmm4, %xmm4
	?vsldoi	v1, $rcon, v9, 15	# vpalignr \$15, %xmm8, %xmm4, %xmm1
	?vsldoi	$rcon, $rcon, $rcon, 15	# vpalignr \$15, %xmm8, %xmm8, %xmm8
	vxor	v7, v7, v1		# vpxor %xmm1, %xmm7, %xmm7

	?vspltw	v0, v0, 3		# vpshufd \$0xFF, %xmm0, %xmm0
	?vsldoi	v0, v0, v0, 1		# vpalignr \$1, %xmm0, %xmm0, %xmm0

	# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	?vsldoi	v1, v9, v7, 12		# vpslldq \$4, %xmm7, %xmm1
	vxor	v7, v7, v1		# vpxor %xmm1, %xmm7, %xmm7
	vspltisb	v1, 0x0f	# 0x0f..0f
	?vsldoi	v4, v9, v7, 8		# vpslldq \$8, %xmm7, %xmm4

	vand	v1, v1, v0		# vpand %xmm9, %xmm0, %xmm1	# 0 = k
	vsrb	v0, v0, v8		# vpsrlb \$4, %xmm0, %xmm0	# 1 = i
	vxor	v7, v7, v4		# vpxor %xmm4, %xmm7, %xmm7
	vperm	v2, $invhi, v9, v1	# vpshufb %xmm1, %xmm11, %xmm2	# 2 = a/k
	vxor	v1, v1, v0		# vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	vperm	v3, $invlo, v9, v0	# vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	vxor	v3, v3, v2		# vpxor %xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	vperm	v4, $invlo, v9, v1	# vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	vxor	v7, v7, v26		# vpxor .Lk_s63(%rip), %xmm7, %xmm7
	vperm	v3, $invlo, v9, v3	# vpshufb %xmm3, %xmm10, %xmm3	# 2 = 1/iak
	vxor	v4, v4, v2		# vpxor %xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v9, v4	# vpshufb %xmm4, %xmm10, %xmm2	# 3 = 1/jak
	vxor	v3, v3, v1		# vpxor %xmm1, %xmm3, %xmm3	# 2 = io
	vxor	v2, v2, v0		# vpxor %xmm0, %xmm2, %xmm2	# 3 = jo
	vperm	v4, v15, v9, v3		# vpshufb %xmm3, %xmm13, %xmm4	# 4 = sbou
	vperm	v1, v14, v9, v2		# vpshufb %xmm2, %xmm12, %xmm1	# 0 = sb1t
	vxor	v1, v1, v4		# vpxor %xmm4, %xmm1, %xmm1	# 0 = sbox output

	# add in smeared stuff
	vxor	v0, v1, v7		# vpxor %xmm7, %xmm1, %xmm0
	vxor	v7, v1, v7		# vmovdqa %xmm0, %xmm7
	.byte	0,12,0x14,0,0,0,0,0

##
## .aes_schedule_transform
##
## Linear-transform %xmm0 according to tables at (%r11)
##
## Requires that %xmm9 = 0x0F0F... as in preheat
##
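## Per byte this is simply (sketch):  out = lo_tbl[x & 0x0f] ^
## hi_tbl[x >> 4], with both 16-entry tables taken from (%r11) --
## a linear map evaluated with two vperm lookups.
##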
_vpaes_schedule_transform:
	#vand	v1, v0, v9		# vpand %xmm9, %xmm0, %xmm1
	vsrb	v2, v0, v8		# vpsrlb \$4, %xmm0, %xmm0
					# vmovdqa (%r11), %xmm2	# lo
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb %xmm1, %xmm2, %xmm2
					# vmovdqa 16(%r11), %xmm1	# hi
	vperm	v2, $ipthi, $ipthi, v2	# vpshufb %xmm0, %xmm1, %xmm0
	vxor	v0, v0, v2		# vpxor %xmm2, %xmm0, %xmm0
	.byte	0,12,0x14,0,0,0,0,0

##
## .aes_schedule_mangle
##
## Mangles %xmm0 from the (basis-transformed) standard version
## to our version.
##
## On encrypt,
##	xor with 0x63
##	multiply by circulant 0,1,1,1
##	apply shiftrows transform
##
## On decrypt,
##	multiply by "inverse mixcolumns" circulant E,B,D,9
##	apply shiftrows transform
##
## Writes out to (%rdx), and increments or decrements it.
## Keeps track of round number mod 4 in %r8.
## Clobbers xmm1-xmm5.
##
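## (On the decrypt path the dksd/dksb/dkse/dks9 table pairs below give
## the key bytes multiplied by D, B, E and 9; xoring those together,
## with an mc_forward rotation between pairs, in effect applies the
## inverse-MixColumns transform that equivalent decryption round keys
## require.)
##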
_vpaes_schedule_mangle:
	#vmr	v4, v0			# vmovdqa %xmm0, %xmm4	# save xmm0 for later
					# vmovdqa .Lk_mc_forward(%rip),%xmm5
	bne	$dir, Lschedule_mangle_dec

	vxor	v4, v0, v26		# vpxor .Lk_s63(%rip), %xmm0, %xmm4
	addi	$out, $out, 16		# add \$16, %rdx
	vperm	v4, v4, v4, v25		# vpshufb %xmm5, %xmm4, %xmm4
	vperm	v1, v4, v4, v25		# vpshufb %xmm5, %xmm4, %xmm1
	vperm	v3, v1, v1, v25		# vpshufb %xmm5, %xmm1, %xmm3
	vxor	v4, v4, v1		# vpxor %xmm1, %xmm4, %xmm4
	lvx	v1, r8, r10		# vmovdqa (%r8,%r10), %xmm1
	vxor	v3, v3, v4		# vpxor %xmm4, %xmm3, %xmm3

	vperm	v3, v3, v3, v1		# vpshufb %xmm1, %xmm3, %xmm3
	addi	r8, r8, -16		# add \$-16, %r8
	andi.	r8, r8, 0x30		# and \$0x30, %r8

	#stvx	v3, 0, $out		# vmovdqu %xmm3, (%rdx)
	vperm	v1, v3, v3, $outperm	# rotate right/left
	vsel	v2, $outhead, v1, $outmask

Lschedule_mangle_dec:
	# inverse mix columns
					# lea .Lk_dksd(%rip),%r11
	vsrb	v1, v0, v8		# vpsrlb \$4, %xmm4, %xmm1	# 1 = hi
	#and	v4, v0, v9		# vpand %xmm9, %xmm4, %xmm4	# 4 = lo

					# vmovdqa 0x00(%r11), %xmm2
	vperm	v2, v16, v16, v0	# vpshufb %xmm4, %xmm2, %xmm2
					# vmovdqa 0x10(%r11), %xmm3
	vperm	v3, v17, v17, v1	# vpshufb %xmm1, %xmm3, %xmm3
	vxor	v3, v3, v2		# vpxor %xmm2, %xmm3, %xmm3
	vperm	v3, v3, v9, v25		# vpshufb %xmm5, %xmm3, %xmm3

					# vmovdqa 0x20(%r11), %xmm2
	vperm	v2, v18, v18, v0	# vpshufb %xmm4, %xmm2, %xmm2
	vxor	v2, v2, v3		# vpxor %xmm3, %xmm2, %xmm2
					# vmovdqa 0x30(%r11), %xmm3
	vperm	v3, v19, v19, v1	# vpshufb %xmm1, %xmm3, %xmm3
	vxor	v3, v3, v2		# vpxor %xmm2, %xmm3, %xmm3
	vperm	v3, v3, v9, v25		# vpshufb %xmm5, %xmm3, %xmm3

					# vmovdqa 0x40(%r11), %xmm2
	vperm	v2, v20, v20, v0	# vpshufb %xmm4, %xmm2, %xmm2
	vxor	v2, v2, v3		# vpxor %xmm3, %xmm2, %xmm2
					# vmovdqa 0x50(%r11), %xmm3
	vperm	v3, v21, v21, v1	# vpshufb %xmm1, %xmm3, %xmm3
	vxor	v3, v3, v2		# vpxor %xmm2, %xmm3, %xmm3

					# vmovdqa 0x60(%r11), %xmm2
	vperm	v2, v22, v22, v0	# vpshufb %xmm4, %xmm2, %xmm2
	vperm	v3, v3, v9, v25		# vpshufb %xmm5, %xmm3, %xmm3
					# vmovdqa 0x70(%r11), %xmm4
	vperm	v4, v23, v23, v1	# vpshufb %xmm1, %xmm4, %xmm4
	lvx	v1, r8, r10		# vmovdqa (%r8,%r10), %xmm1
	vxor	v2, v2, v3		# vpxor %xmm3, %xmm2, %xmm2
	vxor	v3, v4, v2		# vpxor %xmm2, %xmm4, %xmm3

	addi	$out, $out, -16		# add \$-16, %rdx

	vperm	v3, v3, v3, v1		# vpshufb %xmm1, %xmm3, %xmm3
	addi	r8, r8, -16		# add \$-16, %r8
	andi.	r8, r8, 0x30		# and \$0x30, %r8

	#stvx	v3, 0, $out		# vmovdqu %xmm3, (%rdx)
	vperm	v1, v3, v3, $outperm	# rotate right/left
	vsel	v2, $outhead, v1, $outmask
	.byte	0,12,0x14,0,0,0,0,0
.globl	.vpaes_set_encrypt_key
.vpaes_set_encrypt_key:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mfspr	r6, 256			# save vrsave
	stw	r6,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
	mtspr	256, r7			# preserve all AltiVec registers

	srwi	r9, $bits, 5		# shr \$5,%eax
	addi	r9, r9, 6		# add \$5,%eax
	stw	r9, 240($out)		# mov %eax,240(%rdx)	# AES_KEY->rounds = nbits/32+6;

	cmplw	$dir, $bits, $bits	# set encrypt direction
	li	r8, 0x30		# mov \$0x30,%r8d
	bl	_vpaes_schedule_core

	$POP	r0, `$FRAME+$LRSAVE`($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r6			# restore vrsave
	.byte	0,12,0x04,1,0x80,0,3,0
.size	.vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key
.globl	.vpaes_set_decrypt_key
.vpaes_set_decrypt_key:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mfspr	r6, 256			# save vrsave
	stw	r6,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
	mtspr	256, r7			# preserve all AltiVec registers

	srwi	r9, $bits, 5		# shr \$5,%eax
	addi	r9, r9, 6		# add \$5,%eax
	stw	r9, 240($out)		# mov %eax,240(%rdx)	# AES_KEY->rounds = nbits/32+6;

	slwi	r9, r9, 4		# shl \$4,%eax
	add	$out, $out, r9		# lea (%rdx,%rax),%rdx

	cmplwi	$dir, $bits, 0		# set decrypt direction
	srwi	r8, $bits, 1		# shr \$1,%r8d
	andi.	r8, r8, 32		# and \$32,%r8d
	xori	r8, r8, 32		# xor \$32,%r8d	# nbits==192?0:32
	bl	_vpaes_schedule_core

	$POP	r0, `$FRAME+$LRSAVE`($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r6			# restore vrsave
	.byte	0,12,0x04,1,0x80,0,3,0
.size	.vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	# constants table endian-specific conversion
	if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
	    # convert to endian-agnostic format
	    foreach (split(/,\s+/,$1)) {
		my $l = /^0/?oct:int;
		push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;

	    # little-endian conversion
	    if ($flavour =~ /le$/o) {
		SWITCH: for($conv) {
		    /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
		    /\?rev/ && do { @bytes=reverse(@bytes); last; };

	    print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";

	$consts=0 if (m/Lconsts:/o);	# end of table

	# instructions prefixed with '?' are endian-specific and need
	# to be adjusted accordingly...
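	# For instance, on little-endian targets the two vector source
	# operands of a "?vperm" are swapped (so "?vperm v5, v5, v6, $keyperm"
	# is emitted as "vperm v5, v6, v5, $keyperm") and a "?vsldoi ...,N"
	# becomes "...,16-N"; on big-endian the '?' prefix is simply dropped.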
	if ($flavour =~ /le$/o) {	# little-endian
	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
	    s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
	    s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
	} else {			# big-endian