3 ######################################################################
4 ## Constant-time SSSE3 AES core implementation.
7 ## By Mike Hamburg (Stanford University), 2009
10 ## For details see http://shiftleft.org/papers/vector_aes/ and
11 ## http://crypto.stanford.edu/vpaes/.
13 # CBC encrypt/decrypt performance in cycles per byte processed with a 128-bit key; the first column is aes-ppc.pl, the second is this module.
17 # G4e      35.5/52.1/(23.8)    11.9(*)/15.4
18 # POWER6   42.7/54.3/(28.2)    63.0/92.8(**)
19 # POWER7   32.3/42.9/(18.4)    18.5/23.3
21 # (*)  This is ~10% worse than reported in the paper. The reason is
22 #      twofold. First, this module doesn't make any assumption about
23 #      key schedule (or data, for that matter) alignment and handles
24 #      it in-line. Second, being transliterated from
25 #      vpaes-x86_64.pl, it relies on "nested inversion" better suited
#      for Intel CPUs.
27 # (**) Inadequate POWER6 performance is due to the astronomical AltiVec
28 #      latency of 9 cycles per simple logical operation.
32 if ($flavour =~ /64/) {
39 } elsif ($flavour =~ /32/) {
46 } else { die "nonsense $flavour"; }
49 $FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
51 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
52 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
53 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
54 die "can't locate ppc-xlate.pl";
56 open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
63 .align 7 # totally strategic alignment
65 Lk_mc_forward: # mc_forward
66 .long 0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c ?inv
67 .long 0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300 ?inv
68 .long 0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704 ?inv
69 .long 0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08 ?inv
70 Lk_mc_backward: # mc_backward
71 .long 0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e ?inv
72 .long 0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a ?inv
73 .long 0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506 ?inv
74 .long 0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102 ?inv
Lk_sr: # shiftrows
76 .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f ?inv
77 .long 0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b ?inv
78 .long 0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07 ?inv
79 .long 0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603 ?inv
Lk_inv: # inv, inva
85 .long 0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704 ?rev
86 .long 0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03 ?rev
87 Lk_ipt: # input transform (lo, hi)
88 .long 0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca ?rev
89 .long 0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd ?rev
91 .long 0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15 ?rev
92 .long 0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e ?rev
94 .long 0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b ?rev
95 .long 0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5 ?rev
97 .long 0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2 ?rev
98 .long 0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e ?rev
103 Lk_dipt: # decryption input transform
104 .long 0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15 ?rev
105 .long 0x00650560, 0xe683e386, 0x94f191f4, 0x72177712 ?rev
106 Lk_dsbo: # decryption sbox final output
107 .long 0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7 ?rev
108 .long 0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca ?rev
109 Lk_dsb9: # decryption sbox output *9*u, *9*t
110 .long 0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca ?rev
111 .long 0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72 ?rev
112 Lk_dsbd: # decryption sbox output *D*u, *D*t
113 .long 0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5 ?rev
114 .long 0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129 ?rev
115 Lk_dsbb: # decryption sbox output *B*u, *B*t
116 .long 0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660 ?rev
117 .long 0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3 ?rev
118 Lk_dsbe: # decryption sbox output *E*u, *E*t
119 .long 0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222 ?rev
120 .long 0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794 ?rev
123 ## Key schedule constants
125 Lk_dksd: # decryption key schedule: invskew x*D
126 .long 0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007 ?rev
127 .long 0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f ?rev
128 Lk_dksb: # decryption key schedule: invskew x*B
129 .long 0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603 ?rev
130 .long 0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9 ?rev
131 Lk_dkse: # decryption key schedule: invskew x*E + 0x63
132 .long 0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553 ?rev
133 .long 0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd ?rev
134 Lk_dks9: # decryption key schedule: invskew x*9
135 .long 0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a ?rev
136 .long 0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b ?rev
Lk_rcon: # rcon
139 .long 0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70 ?asis
Lk_s63:
141 .long 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b ?asis
143 Lk_opt: # output transform
144 .long 0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7 ?rev
145 .long 0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1 ?rev
146 Lk_deskew: # deskew tables: inverts the sbox's "skew"
147 .long 0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d ?rev
148 .long 0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128 ?rev
153 mflr r12 #vvvvv "distance between . and _vpaes_consts"
158 .byte 0,12,0x14,0,0,0,0,0
159 .asciz "Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"
163 my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));
165 my ($inp,$out,$key) = map("r$_",(3..5));
167 my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
168 my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
169 my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));
175 ## Fills register %r10 -> .aes_consts (so you can -fPIC)
176 ## and %xmm9-%xmm15 as specified below.
179 _vpaes_encrypt_preheat:
183 li r11, 0xc0 # Lk_inv
187 vxor v7, v7, v7 # 0x00..00
188 vspltisb v8,4 # 0x04..04
189 vspltisb v9,0x0f # 0x0f..0f
208 .byte 0,12,0x14,0,0,0,0,0
213 ## AES-encrypt %xmm0.
217 ## %xmm9-%xmm15 as in _vpaes_preheat
218 ## (%rdx) = scheduled keys
221 ## Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax
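##
## As orientation, textbook AES encryption in scalar pseudocode (an
## illustration only, not part of the generated code) is:
##
##     state = in ^ round_key[0];
##     for (i = 1; i < rounds; i++)
##         state = mix_columns(shift_rows(sub_bytes(state))) ^ round_key[i];
##     out = shift_rows(sub_bytes(state)) ^ round_key[rounds];
##
## The code below computes the same function, but keeps the state in a
## transformed basis (Lk_ipt on input, Lk_sbo on output), folds ShiftRows
## into the Lk_mc_forward/Lk_mc_backward/Lk_sr permutations, and evaluates
## the S-box with constant-time 4-bit table lookups (see the paper above).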
226 lwz r8, 240($key) # pull rounds
228 lvx v5, 0, $key # vmovdqu (%r9), %xmm5 # round0 key
232 ?vperm v5, v5, v6, $keyperm # align round key
234 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
235 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm1
236 vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm3, %xmm2
237 vxor v0, v0, v5 # vpxor %xmm5, %xmm1, %xmm0
238 vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
244 # middle of middle round
245 vperm v4, $sb1t, v7, v2 # vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
246 lvx v1, r12, r11 # vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
248 vperm v0, $sb1u, v7, v3 # vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
249 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
250 andi. r11, r11, 0x30 # and \$0x30, %r11 # ... mod 4
251 vperm v5, $sb2t, v7, v2 # vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
252 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
253 vperm v2, $sb2u, v7, v3 # vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
254 lvx v4, r12, r10 # vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
256 vperm v3, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
257 vxor v2, v2, v5 # vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
258 vperm v0, v0, v7, v4 # vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
259 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
260 vperm v4, v3, v7, v1 # vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
261 vxor v0, v0, v3 # vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
262 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
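# (The A/B/C/D bookkeeping above is the MixColumns identity: each output
#  byte equals 2*A ^ 3*B ^ C ^ D over GF(2^8), where A..D are the four
#  column bytes starting at that position; the mc_forward/mc_backward
#  permutations supply the rotations and the sb2 tables the doubled terms.)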
266 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
267 vperm v5, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
268 vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
269 vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
270 vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
272 vxor v3, v3, v5 # vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
273 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
274 vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
276 lvx v6, r9, $key # vmovdqu (%r9), %xmm5
277 vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
279 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
280 ?vperm v5, v5, v6, $keyperm # align round key
281 vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
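# Restated as scalar pseudocode (an illustration of the bookkeeping above,
# not part of the generated code), with inv/inva the two halves of Lk_inv:
#     k  = x & 0x0f;  i = x >> 4;  j = i ^ k;
#     ak = inva[k];
#     io = inv[ inv[i] ^ ak ] ^ j;
#     jo = inv[ inv[j] ^ ak ] ^ i;
# io and jo then index the output tables: sbou/sbot in the last round,
# sb1/sb2 in the middle rounds.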
284 # middle of last round
286 # vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
287 # vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
288 vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
289 lvx v1, r12, r10 # vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
290 vperm v0, $sbot, v7, v3 # vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
291 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
292 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
293 vperm v0, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm0
296 .byte 0,12,0x14,0,0,0,0,0
298 .globl .vpaes_encrypt
301 $STU $sp,-$FRAME($sp)
302 li r10,`15+6*$SIZE_T`
303 li r11,`31+6*$SIZE_T`
305 mfspr r7, 256 # save vrsave
328 stw r7,`$FRAME-4`($sp) # save vrsave
330 $PUSH r6,`$FRAME+$LRSAVE`($sp)
331 mtspr 256, r0 # preserve all AltiVec registers
333 bl _vpaes_encrypt_preheat
335 ?lvsl $inpperm, 0, $inp # prepare for unaligned access
337 addi $inp, $inp, 15 # 15 is not a typo
338 ?lvsr $outperm, 0, $out
339 ?lvsl $keyperm, 0, $key # prepare for unaligned access
340 lvx $inptail, 0, $inp # redundant in aligned case
341 ?vperm v0, v0, $inptail, $inpperm
343 bl _vpaes_encrypt_core
349 vperm v0, v0, v0, $outperm # rotate right/left
354 bdnz Lenc_out_unaligned
362 li r10,`15+6*$SIZE_T`
363 li r11,`31+6*$SIZE_T`
365 mtspr 256, r7 # restore vrsave
391 .byte 0,12,0x04,1,0x80,0,3,0
393 .size .vpaes_encrypt,.-.vpaes_encrypt
396 _vpaes_decrypt_preheat:
400 li r11, 0xc0 # Lk_inv
404 vxor v7, v7, v7 # 0x00..00
405 vspltisb v8,4 # 0x04..04
406 vspltisb v9,0x0f # 0x0f..0f
433 .byte 0,12,0x14,0,0,0,0,0
438 ## Same API as encryption core.
442 lwz r8, 240($key) # pull rounds
444 lvx v5, 0, $key # vmovdqu (%r9), %xmm4 # round0 key
448 ?vperm v5, v5, v6, $keyperm # align round key
449 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
450 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
451 vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm1, %xmm0
452 vxor v0, v0, v5 # vpxor %xmm4, %xmm2, %xmm2
453 vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
460 # Inverse mix columns
462 lvx v0, r12, r11 # v5 and v0 are flipped
463 # vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
464 # vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
465 vperm v4, $sb9u, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
467 vperm v1, $sb9t, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
469 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0
470 # vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
471 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
472 # vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
474 vperm v4, $sbdu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
475 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
476 vperm v1, $sbdt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
477 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
478 # vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
479 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
480 # vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
482 vperm v4, $sbbu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
483 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
484 vperm v1, $sbbt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
485 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
486 # vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
487 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
488 # vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
490 vperm v4, $sbeu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
491 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
492 vperm v1, $sbet, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
493 vxor v0, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
494 vxor v0, v0, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
498 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
499 vperm v2, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
500 vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
501 vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
502 vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
504 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
505 vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
506 vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
508 lvx v6, r9, $key # vmovdqu (%r9), %xmm0
509 vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
511 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
512 ?vperm v5, v5, v6, $keyperm # align round key
513 vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
516 # middle of last round
518 # vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
519 vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
520 # vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
521 lvx v2, r12, r10 # vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
522 vperm v1, $sbot, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
523 vxor v4, v4, v5 # vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
524 vxor v0, v1, v4 # vpxor %xmm4, %xmm1, %xmm0 # 0 = A
525 vperm v0, v0, v7, v2 # vpshufb %xmm2, %xmm0, %xmm0
528 .byte 0,12,0x14,0,0,0,0,0
530 .globl .vpaes_decrypt
533 $STU $sp,-$FRAME($sp)
534 li r10,`15+6*$SIZE_T`
535 li r11,`31+6*$SIZE_T`
537 mfspr r7, 256 # save vrsave
560 stw r7,`$FRAME-4`($sp) # save vrsave
562 $PUSH r6,`$FRAME+$LRSAVE`($sp)
563 mtspr 256, r0 # preserve all AltiVec registers
565 bl _vpaes_decrypt_preheat
567 ?lvsl $inpperm, 0, $inp # prepare for unaligned access
569 addi $inp, $inp, 15 # 15 is not a typo
570 ?lvsr $outperm, 0, $out
571 ?lvsl $keyperm, 0, $key
572 lvx $inptail, 0, $inp # redundant in aligned case
573 ?vperm v0, v0, $inptail, $inpperm
575 bl _vpaes_decrypt_core
581 vperm v0, v0, v0, $outperm # rotate right/left
586 bdnz Ldec_out_unaligned
594 li r10,`15+6*$SIZE_T`
595 li r11,`31+6*$SIZE_T`
597 mtspr 256, r7 # restore vrsave
623 .byte 0,12,0x04,1,0x80,0,3,0
625 .size .vpaes_decrypt,.-.vpaes_decrypt
627 .globl .vpaes_cbc_encrypt
633 $STU $sp,-`($FRAME+2*$SIZE_T)`($sp)
635 li r10,`15+6*$SIZE_T`
636 li r11,`31+6*$SIZE_T`
660 stw r12,`$FRAME-4`($sp) # save vrsave
661 $PUSH r30,`$FRAME+$SIZE_T*0`($sp)
662 $PUSH r31,`$FRAME+$SIZE_T*1`($sp)
664 $PUSH r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
666 and r30, r5, r9 # copy length&-16
667 andi. r9, $out, 15 # is $out aligned?
668 mr r5, r6 # copy pointer to key
669 mr r31, r7 # copy pointer to iv
671 mcrf cr1, cr0 # put aside $out alignment flag
672 mr r7, r12 # copy vrsave
673 mtspr 256, r6 # preserve all AltiVec registers
675 lvx v24, 0, r31 # load [potentially unaligned] iv
677 ?lvsl $inpperm, 0, r31
679 ?vperm v24, v24, v25, $inpperm
681 cmpwi r8, 0 # test direction
682 neg r8, $inp # prepare for unaligned access
684 ?lvsl $keyperm, 0, $key
685 ?lvsr $outperm, 0, $out
686 ?lvsr $inpperm, 0, r8 # -$inp
687 vnor $outmask, v7, v7 # 0xff..ff
688 lvx $inptail, 0, $inp
689 ?vperm $outmask, v7, $outmask, $outperm
690 addi $inp, $inp, 15 # 15 is not a typo
694 bl _vpaes_encrypt_preheat
697 beq cr1, Lcbc_enc_loop # $out is aligned
700 lvx $inptail, 0, $inp
702 ?vperm v0, v0, $inptail, $inpperm
703 vxor v0, v0, v24 # ^= iv
705 bl _vpaes_encrypt_core
708 vmr v24, v0 # put aside iv
710 vperm $outhead, v0, v0, $outperm # rotate right/left
713 stvebx $outhead, r8, r9
718 sub. r30, r30, r0 # len -= 16
720 beq Lcbc_unaligned_done
724 lvx $inptail, 0, $inp
726 ?vperm v0, v0, $inptail, $inpperm
727 vxor v0, v0, v24 # ^= iv
729 bl _vpaes_encrypt_core
731 vmr v24, v0 # put aside iv
732 sub. r30, r30, r0 # len -= 16
733 vperm v0, v0, v0, $outperm # rotate right/left
734 vsel v1, $outhead, v0, $outmask
744 bl _vpaes_decrypt_preheat
747 beq cr1, Lcbc_dec_loop # $out is aligned
750 lvx $inptail, 0, $inp
752 ?vperm v0, v0, $inptail, $inpperm
753 vmr v25, v0 # put aside input
755 bl _vpaes_decrypt_core
758 vxor v0, v0, v24 # ^= iv
761 vperm $outhead, v0, v0, $outperm # rotate right/left
764 stvebx $outhead, r8, r9
769 sub. r30, r30, r0 # len -= 16
771 beq Lcbc_unaligned_done
775 lvx $inptail, 0, $inp
777 ?vperm v0, v0, $inptail, $inpperm
778 vmr v25, v0 # put aside input
780 bl _vpaes_decrypt_core
782 vxor v0, v0, v24 # ^= iv
784 sub. r30, r30, r0 # len -= 16
785 vperm v0, v0, v0, $outperm # rotate right/left
786 vsel v1, $outhead, v0, $outmask
793 beq cr1, Lcbc_write_iv # $out is aligned
800 stvebx $outhead, r9, $out
806 neg r8, r31 # write [potentially unaligned] iv
808 ?lvsl $outperm, 0, r8
811 vperm v24, v24, v24, $outperm # rotate right/left
812 stvewx v24, 0, r31 # ivp is at least 32-bit aligned
817 mtspr 256, r7 # restore vrsave
818 li r10,`15+6*$SIZE_T`
819 li r11,`31+6*$SIZE_T`
843 $POP r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
844 $POP r30,`$FRAME+$SIZE_T*0`($sp)
845 $POP r31,`$FRAME+$SIZE_T*1`($sp)
847 addi $sp,$sp,`$FRAME+$SIZE_T*2`
850 .byte 0,12,0x04,1,0x80,2,6,0
852 .size .vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
856 my ($inp,$bits,$out)=map("r$_",(3..5));
858 my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));
861 ########################################################
863 ## AES key schedule ##
865 ########################################################
871 li r11, 0xc0 # Lk_inv
876 vspltisb v8,4 # 0x04..04
877 vxor v9,v9,v9 # 0x00..00
878 lvx $invlo, r12, r11 # Lk_inv
882 lvx $iptlo, r12, r9 # Lk_ipt
887 lvx v14, r12, r11 # Lk_sb1
892 lvx v16, r12, r9 # Lk_dksd
896 lvx v18, r12, r11 # Lk_dksb
900 lvx v20, r12, r9 # Lk_dkse
904 lvx v22, r12, r11 # Lk_dks9
907 lvx v24, r12, r9 # Lk_rcon
908 lvx v25, 0, r12 # Lk_mc_forward[0]
909 lvx v26, r12, r8 # Lk_s63
912 .byte 0,12,0x14,0,0,0,0,0
915 _vpaes_schedule_core:
918 bl _vpaes_key_preheat # load the tables
920 #lvx v0, 0, $inp # vmovdqu (%rdi), %xmm0 # load key (unaligned)
921 neg r8, $inp # prepare for unaligned access
923 addi $inp, $inp, 15 # 15 is not a typo
924 ?lvsr $inpperm, 0, r8 # -$inp
925 lvx v6, 0, $inp # v6 serves as inptail
927 ?vperm v0, v0, v6, $inpperm
930 vmr v3, v0 # vmovdqa %xmm0, %xmm3
931 bl _vpaes_schedule_transform
932 vmr v7, v0 # vmovdqa %xmm0, %xmm7
934 bne $dir, Lschedule_am_decrypting
936 # encrypting, output zeroth round key after transform
937 li r8, 0x30 # mov \$0x30,%r8d
942 ?lvsr $outperm, 0, $out # prepare for unaligned access
943 vnor $outmask, v9, v9 # 0xff..ff
944 ?vperm $outmask, v9, $outmask, $outperm
946 #stvx v0, 0, $out # vmovdqu %xmm0, (%rdx)
947 vperm $outhead, v0, v0, $outperm # rotate right/left
948 stvewx $outhead, 0, $out # some are superfluous
949 stvewx $outhead, r9, $out
950 stvewx $outhead, r10, $out
951 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
952 stvewx $outhead, r11, $out
955 Lschedule_am_decrypting:
956 srwi r8, $bits, 1 # shr \$1,%r8d
957 andi. r8, r8, 32 # and \$32,%r8d
958 xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
959 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
960 # decrypting, output zeroth round key after shiftrows
961 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
965 vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
967 neg r0, $out # prepare for unaligned access
968 ?lvsl $outperm, 0, r0
969 vnor $outmask, v9, v9 # 0xff..ff
970 ?vperm $outmask, $outmask, v9, $outperm
972 #stvx v4, 0, $out # vmovdqu %xmm3, (%rdx)
973 vperm $outhead, v4, v4, $outperm # rotate right/left
974 stvewx $outhead, 0, $out # some are superfluous
975 stvewx $outhead, r9, $out
976 stvewx $outhead, r10, $out
977 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
978 stvewx $outhead, r11, $out
979 addi $out, $out, 15 # 15 is not a typo
980 xori r8, r8, 0x30 # xor \$0x30, %r8
983 cmplwi $bits, 192 # cmp \$192, %esi
991 ## 128-bit specific part of key schedule.
993 ## This schedule is really simple, because all its parts
994 ## are accomplished by the subroutines.
997 li r0, 10 # mov \$10, %esi
1001 bl _vpaes_schedule_round
1002 bdz Lschedule_mangle_last # dec %esi
1003 bl _vpaes_schedule_mangle # write output
1007 ## .aes_schedule_192
1009 ## 192-bit specific part of key schedule.
1011 ## The main body of this schedule is the same as the 128-bit
1012 ## schedule, but with more smearing. The long, high side is
1013 ## stored in %xmm7 as before, and the short, low side is in
1014 ## the high bits of %xmm6.
1016 ## This schedule is somewhat nastier, however, because each
1017 ## round produces 192 bits of key material, or 1.5 round keys.
1018 ## Therefore, on each cycle we do 2 rounds and produce 3 round keys.
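## (Concretely: AES-192 has 12 rounds and needs 13 round keys in all.
## With 1.5 round keys produced per schedule round, the loop below
## performs its two rounds per pass with a count of 4, and the final
## key is written out via the Lschedule_mangle_last path.)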
1023 li r0, 4 # mov \$4, %esi
1025 ?vperm v0, v6, v0, $inpperm
1026 ?vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
1027 bl _vpaes_schedule_transform # input transform
1028 ?vsldoi v6, v0, v9, 8
1029 ?vsldoi v6, v9, v6, 8 # clobber "low" side with zeros
1033 bl _vpaes_schedule_round
1034 ?vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0
1035 bl _vpaes_schedule_mangle # save key n
1036 bl _vpaes_schedule_192_smear
1037 bl _vpaes_schedule_mangle # save key n+1
1038 bl _vpaes_schedule_round
1039 bdz Lschedule_mangle_last # dec %esi
1040 bl _vpaes_schedule_mangle # save key n+2
1041 bl _vpaes_schedule_192_smear
1045 ## .aes_schedule_256
1047 ## 256-bit specific part of key schedule.
1049 ## The structure here is very similar to the 128-bit
1050 ## schedule, but with an additional "low side" in
1051 ## %xmm6. The low side's rounds are the same as the
1052 ## high side's, except no rcon and no rotation.
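## (Concretely: AES-256 has 14 rounds and needs 15 round keys in all;
## the loop below runs with a count of 7, alternating the high- and
## low-side rounds described above, and the final key is written out
## via the Lschedule_mangle_last path.)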
1056 li r0, 7 # mov \$7, %esi
1058 lvx v0, 0, $inp # vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
1059 ?vperm v0, v6, v0, $inpperm
1060 bl _vpaes_schedule_transform # input transform
1064 bl _vpaes_schedule_mangle # output low result
1065 vmr v6, v0 # vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
1068 bl _vpaes_schedule_round
1069 bdz Lschedule_mangle_last # dec %esi
1070 bl _vpaes_schedule_mangle
1072 # low round. swap xmm7 and xmm6
1073 ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
1074 vmr v5, v7 # vmovdqa %xmm7, %xmm5
1075 vmr v7, v6 # vmovdqa %xmm6, %xmm7
1076 bl _vpaes_schedule_low_round
1077 vmr v7, v5 # vmovdqa %xmm5, %xmm7
1081 ## .aes_schedule_mangle_last
1083 ## Mangler for last round of key schedule
1085 ## when encrypting, outputs out(%xmm0) ^ 63
1086 ## when decrypting, outputs unskew(%xmm0)
1088 ## Always called right before return... jumps to cleanup and exits
1091 Lschedule_mangle_last:
1092 # schedule last round key from xmm0
1093 li r11, 0x2e0 # lea .Lk_deskew(%rip),%r11
1095 bne $dir, Lschedule_mangle_last_dec
1098 lvx v1, r8, r10 # vmovdqa (%r8,%r10),%xmm1
1099 li r11, 0x2c0 # lea .Lk_opt(%rip), %r11 # prepare to output transform
1100 li r9, 0x2d0 # prepare to output transform
1101 vperm v0, v0, v0, v1 # vpshufb %xmm1, %xmm0, %xmm0 # output permute
1103 lvx $iptlo, r11, r12 # reload $ipt
1105 addi $out, $out, 16 # add \$16, %rdx
1106 vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
1107 bl _vpaes_schedule_transform # output transform
1109 #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
1110 vperm v0, v0, v0, $outperm # rotate right/left
1112 vsel v2, $outhead, v0, $outmask
1116 stvewx v0, 0, $out # some (or all) are redundant
1117 stvewx v0, r10, $out
1118 stvewx v0, r11, $out
1119 stvewx v0, r12, $out
1120 b Lschedule_mangle_done
1123 Lschedule_mangle_last_dec:
1124 lvx $iptlo, r11, r12 # reload $ipt
1126 addi $out, $out, -16 # add \$-16, %rdx
1127 vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
1128 bl _vpaes_schedule_transform # output transform
1130 #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
1131 addi r9, $out, -15 # -15 is not a typo
1132 vperm v0, v0, v0, $outperm # rotate right/left
1134 vsel v2, $outhead, v0, $outmask
1138 stvewx v0, 0, r9 # some (or all) are redundant
1144 Lschedule_mangle_done:
1147 vxor v0, v0, v0 # vpxor %xmm0, %xmm0, %xmm0
1148 vxor v1, v1, v1 # vpxor %xmm1, %xmm1, %xmm1
1149 vxor v2, v2, v2 # vpxor %xmm2, %xmm2, %xmm2
1150 vxor v3, v3, v3 # vpxor %xmm3, %xmm3, %xmm3
1151 vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
1152 vxor v5, v5, v5 # vpxor %xmm5, %xmm5, %xmm5
1153 vxor v6, v6, v6 # vpxor %xmm6, %xmm6, %xmm6
1154 vxor v7, v7, v7 # vpxor %xmm7, %xmm7, %xmm7
1158 .byte 0,12,0x14,0,0,0,0,0
1161 ## .aes_schedule_192_smear
1163 ## Smear the short, low side in the 192-bit key schedule.
## Inputs:
1166 ## %xmm7: high side, b a x y
1167 ## %xmm6: low side, d c 0 0
##
## Outputs:
1171 ## %xmm6: b+c+d b+c 0 0
1172 ## %xmm0: b+c+d b+c b a
1175 _vpaes_schedule_192_smear:
1177 ?vsldoi v1, v9, v6, 12 # vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
1178 ?vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
1179 vxor v6, v6, v1 # vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
1180 vxor v6, v6, v0 # vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
1182 ?vsldoi v6, v6, v9, 8
1183 ?vsldoi v6, v9, v6, 8 # clobber low side with zeros
1186 .byte 0,12,0x14,0,0,0,0,0
1189 ## .aes_schedule_round
1191 ## Runs one main round of the key schedule on %xmm0, %xmm7
1193 ## Specifically, runs subbytes on the high dword of %xmm0
1194 ## then rotates it by one byte and xors into the low dword of %xmm7.
1197 ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for the next rcon.
1200 ## Smears the dwords of %xmm7 by xoring the low into the
1201 ## second low, result into third, result into highest.
1203 ## Returns results in %xmm7 = %xmm0.
1204 ## Clobbers %xmm1-%xmm4, %r11.
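##
## In scalar terms this is the usual FIPS-197 expansion step for one
## 128-bit block of key material (illustrative C-like pseudocode with
## hypothetical helpers, not part of the generated code):
##
##     t    = rot_word(sub_word(k[3])) ^ rcon;
##     n[0] = k[0] ^ t;        /* the "smear" is the running xor below */
##     n[1] = k[1] ^ n[0];
##     n[2] = k[2] ^ n[1];
##     n[3] = k[3] ^ n[2];
##
## with k[] the previous round key (%xmm7/%xmm0 on entry) and n[] the new one.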
1207 _vpaes_schedule_round:
1208 # extract rcon from xmm8
1209 #vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
1210 ?vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1
1211 ?vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8
1212 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
1215 ?vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
1216 ?vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0
1220 # low round: same as high round, but no rotation and no rcon.
1221 _vpaes_schedule_low_round:
1223 ?vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1
1224 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
1225 vspltisb v1, 0x0f # 0x0f..0f
1226 ?vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4
1229 vand v1, v1, v0 # vpand %xmm9, %xmm0, %xmm1 # 0 = k
1230 vsrb v0, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
1231 vxor v7, v7, v4 # vpxor %xmm4, %xmm7, %xmm7
1232 vperm v2, $invhi, v9, v1 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
1233 vxor v1, v1, v0 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
1234 vperm v3, $invlo, v9, v0 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
1235 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
1236 vperm v4, $invlo, v9, v1 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
1237 vxor v7, v7, v26 # vpxor .Lk_s63(%rip), %xmm7, %xmm7
1238 vperm v3, $invlo, v9, v3 # vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
1239 vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
1240 vperm v2, $invlo, v9, v4 # vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
1241 vxor v3, v3, v1 # vpxor %xmm1, %xmm3, %xmm3 # 2 = io
1242 vxor v2, v2, v0 # vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
1243 vperm v4, v15, v9, v3 # vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
1244 vperm v1, v14, v9, v2 # vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
1245 vxor v1, v1, v4 # vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
1247 # add in smeared stuff
1248 vxor v0, v1, v7 # vpxor %xmm7, %xmm1, %xmm0
1249 vxor v7, v1, v7 # vmovdqa %xmm0, %xmm7
1252 .byte 0,12,0x14,0,0,0,0,0
1255 ## .aes_schedule_transform
1257 ## Linear-transform %xmm0 according to tables at (%r11)
1259 ## Requires that %xmm9 = 0x0F0F... as in preheat
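##
## A scalar sketch of the transform (illustration only, not part of the
## generated code): each byte is split into nibbles and run through two
## 16-entry tables whose results are xor-ed together,
##
##     out = lo_tbl[x & 0x0f] ^ hi_tbl[(x >> 4) & 0x0f];
##
## the two vperm instructions below apply this to all 16 bytes at once.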
1264 _vpaes_schedule_transform:
1265 #vand v1, v0, v9 # vpand %xmm9, %xmm0, %xmm1
1266 vsrb v2, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
1267 # vmovdqa (%r11), %xmm2 # lo
1268 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
1269 # vmovdqa 16(%r11), %xmm1 # hi
1270 vperm v2, $ipthi, $ipthi, v2 # vpshufb %xmm0, %xmm1, %xmm0
1271 vxor v0, v0, v2 # vpxor %xmm2, %xmm0, %xmm0
1274 .byte 0,12,0x14,0,0,0,0,0
1277 ## .aes_schedule_mangle
1279 ## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt:
## xor with 0x63
1284 ## multiply by circulant 0,1,1,1
1285 ## apply shiftrows transform
##
## On decrypt:
1289 ## multiply by "inverse mixcolumns" circulant E,B,D,9
## deskew
1291 ## apply shiftrows transform
1294 ## Writes out to (%rdx), and increments or decrements it
1295 ## Keeps track of round number mod 4 in %r8
1297 ## Clobbers xmm1-xmm5
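##
## (The "inverse mixcolumns" circulant E,B,D,9 is the InvMixColumns matrix
## {0e,0b,0d,09} of FIPS-197 applied to the round key, as in the standard
## "equivalent inverse cipher" key schedule.)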
1300 _vpaes_schedule_mangle:
1301 #vmr v4, v0 # vmovdqa %xmm0, %xmm4 # save xmm0 for later
1302 # vmovdqa .Lk_mc_forward(%rip),%xmm5
1303 bne $dir, Lschedule_mangle_dec
1306 vxor v4, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm4
1307 addi $out, $out, 16 # add \$16, %rdx
1308 vperm v4, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm4
1309 vperm v1, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm1
1310 vperm v3, v1, v1, v25 # vpshufb %xmm5, %xmm1, %xmm3
1311 vxor v4, v4, v1 # vpxor %xmm1, %xmm4, %xmm4
1312 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
1313 vxor v3, v3, v4 # vpxor %xmm4, %xmm3, %xmm3
1315 vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
1316 addi r8, r8, -16 # add \$-16, %r8
1317 andi. r8, r8, 0x30 # and \$0x30, %r8
1319 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
1320 vperm v1, v3, v3, $outperm # rotate right/left
1321 vsel v2, $outhead, v1, $outmask
1327 Lschedule_mangle_dec:
1328 # inverse mix columns
1329 # lea .Lk_dksd(%rip),%r11
1330 vsrb v1, v0, v8 # vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
1331 #and v4, v0, v9 # vpand %xmm9, %xmm4, %xmm4 # 4 = lo
1333 # vmovdqa 0x00(%r11), %xmm2
1334 vperm v2, v16, v16, v0 # vpshufb %xmm4, %xmm2, %xmm2
1335 # vmovdqa 0x10(%r11), %xmm3
1336 vperm v3, v17, v17, v1 # vpshufb %xmm1, %xmm3, %xmm3
1337 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1338 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1340 # vmovdqa 0x20(%r11), %xmm2
1341 vperm v2, v18, v18, v0 # vpshufb %xmm4, %xmm2, %xmm2
1342 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1343 # vmovdqa 0x30(%r11), %xmm3
1344 vperm v3, v19, v19, v1 # vpshufb %xmm1, %xmm3, %xmm3
1345 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1346 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1348 # vmovdqa 0x40(%r11), %xmm2
1349 vperm v2, v20, v20, v0 # vpshufb %xmm4, %xmm2, %xmm2
1350 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1351 # vmovdqa 0x50(%r11), %xmm3
1352 vperm v3, v21, v21, v1 # vpshufb %xmm1, %xmm3, %xmm3
1353 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1355 # vmovdqa 0x60(%r11), %xmm2
1356 vperm v2, v22, v22, v0 # vpshufb %xmm4, %xmm2, %xmm2
1357 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1358 # vmovdqa 0x70(%r11), %xmm4
1359 vperm v4, v23, v23, v1 # vpshufb %xmm1, %xmm4, %xmm4
1360 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
1361 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1362 vxor v3, v4, v2 # vpxor %xmm2, %xmm4, %xmm3
1364 addi $out, $out, -16 # add \$-16, %rdx
1366 vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
1367 addi r8, r8, -16 # add \$-16, %r8
1368 andi. r8, r8, 0x30 # and \$0x30, %r8
1370 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
1371 vperm v1, v3, v3, $outperm # rotate right/left
1372 vsel v2, $outhead, v1, $outmask
1377 .byte 0,12,0x14,0,0,0,0,0
1379 .globl .vpaes_set_encrypt_key
1381 .vpaes_set_encrypt_key:
1382 $STU $sp,-$FRAME($sp)
1383 li r10,`15+6*$SIZE_T`
1384 li r11,`31+6*$SIZE_T`
1386 mfspr r6, 256 # save vrsave
1409 stw r6,`$FRAME-4`($sp) # save vrsave
1411 $PUSH r0, `$FRAME+$LRSAVE`($sp)
1412 mtspr 256, r7 # preserve all AltiVec registers
1414 srwi r9, $bits, 5 # shr \$5,%eax
1415 addi r9, r9, 6 # add \$5,%eax
1416 stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+6;
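# (e.g. 128>>5 = 4, +6 = 10 rounds; 192 -> 12, 256 -> 14)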
1418 cmplw $dir, $bits, $bits # set encrypt direction
1419 li r8, 0x30 # mov \$0x30,%r8d
1420 bl _vpaes_schedule_core
1422 $POP r0, `$FRAME+$LRSAVE`($sp)
1423 li r10,`15+6*$SIZE_T`
1424 li r11,`31+6*$SIZE_T`
1425 mtspr 256, r6 # restore vrsave
1453 .byte 0,12,0x04,1,0x80,0,3,0
1455 .size .vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key
1457 .globl .vpaes_set_decrypt_key
1459 .vpaes_set_decrypt_key:
1460 $STU $sp,-$FRAME($sp)
1461 li r10,`15+6*$SIZE_T`
1462 li r11,`31+6*$SIZE_T`
1464 mfspr r6, 256 # save vrsave
1487 stw r6,`$FRAME-4`($sp) # save vrsave
1489 $PUSH r0, `$FRAME+$LRSAVE`($sp)
1490 mtspr 256, r7 # preserve all AltiVec registers
1492 srwi r9, $bits, 5 # shr \$5,%eax
1493 addi r9, r9, 6 # add \$5,%eax
1494 stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+6;
1496 slwi r9, r9, 4 # shl \$4,%eax
1497 add $out, $out, r9 # lea (%rdx,%rax),%rdx
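# (the decryption schedule is written from the end of the key buffer
#  backwards, so $out now points at the slot of the last round key)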
1499 cmplwi $dir, $bits, 0 # set decrypt direction
1500 srwi r8, $bits, 1 # shr \$1,%r8d
1501 andi. r8, r8, 32 # and \$32,%r8d
1502 xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
1503 bl _vpaes_schedule_core
1505 $POP r0, `$FRAME+$LRSAVE`($sp)
1506 li r10,`15+6*$SIZE_T`
1507 li r11,`31+6*$SIZE_T`
1508 mtspr 256, r6 # restore vrsave
1536 .byte 0,12,0x04,1,0x80,0,3,0
1538 .size .vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key
1543 foreach (split("\n",$code)) {
1544 s/\`([^\`]*)\`/eval $1/geo;
1546 # constants table endian-specific conversion
1547 if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
1551 # convert to endian-agnostic format
1552 foreach (split(/,\s+/,$1)) {
1553 my $l = /^0/?oct:int;
1554 push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
1557 # little-endian conversion
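# ?inv marks vperm index vectors: on little-endian each index byte is
# xor-ed with 0xf because lane numbering is mirrored; ?rev marks data
# constants that are emitted with their 16 bytes reversed; ?asis
# constants are, as the name suggests, left untouched.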
1558 if ($flavour =~ /le$/o) {
1559 SWITCH: for($conv) {
1560 /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
1561 /\?rev/ && do { @bytes=reverse(@bytes); last; };
1566 print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
1569 $consts=0 if (m/Lconsts:/o); # end of table
1571 # instructions prefixed with '?' are endian-specific and need
1572 # to be adjusted accordingly...
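# For little-endian the fixups below are: swap the two source operands of
# vperm (vector byte order is mirrored), turn a vsldoi shift by N into a
# shift by 16-N with the sources swapped, and mirror a vspltw word index
# to 3-N.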
1573 if ($flavour =~ /le$/o) { # little-endian
1576 s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
1577 s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
1578 s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
1579 } else { # big-endian