######################################################################
## Constant-time SSSE3 AES core implementation.
##
## By Mike Hamburg (Stanford University), 2009
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
# CBC encrypt/decrypt performance in cycles per byte processed with
# 128-bit key.
#
#		aes-ppc.pl		this
# G4e		35.5/52.1/(23.8)	11.9(*)/15.4
# POWER6	42.7/54.3/(28.2)	63.0/92.8(**)
# POWER7	32.3/42.9/(18.4)	18.5/23.3
#
# (*)	This is ~10% worse than reported in the paper. The reason is
#	twofold. First, this module makes no assumptions about key
#	schedule (or data, for that matter) alignment and handles it
#	in-line. Second, being transliterated from vpaes-x86_64.pl,
#	it relies on "nested inversion", which is better suited to
#	Intel CPUs.
# (**)	Inadequate POWER6 performance is due to astronomical AltiVec
#	latency, 9 cycles per simple logical operation.
if ($flavour =~ /64/) {
} elsif ($flavour =~ /32/) {
} else { die "nonsense $flavour"; }
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
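
# A standalone sketch of the perlasm flow this module follows from here
# on: assembly text accumulates in $code inside heredocs and is printed
# into the xlate pipe opened above. (Hypothetical label below, shown
# only to illustrate the pattern; output is off by default so the real
# assembly stream stays untouched.)
my $sketch = "";
$sketch .= <<'___';
.align	4
Lk_example:	# hypothetical constant
	.long	0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
___
print $sketch if $ENV{VPAES_SKETCH};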
.align	7		# totally strategic alignment
_vpaes_consts:
Lk_mc_forward:	# mc_forward
	.long	0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c
	.long	0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300
	.long	0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704
	.long	0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08
Lk_mc_backward:	# mc_backward
	.long	0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e
	.long	0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a
	.long	0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506
	.long	0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102
Lk_sr:		# sr (shiftrows permutations)
	.long	0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
	.long	0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b
	.long	0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07
	.long	0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603
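
## (Standalone Perl sanity check, not part of the emitted assembly:
## byte b of word w in Lk_mc_forward row 0 should select source byte
## 4*w + (b+1)%4, i.e. a one-byte rotation within each 32-bit column,
## which is exactly what the MixColumns step below relies on.)
my @mc_forward_row0 = (0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c);
for my $w (0 .. 3) {
	for my $b (0 .. 3) {
		my $expect = 4 * $w + ($b + 1) % 4;
		my $got = ($mc_forward_row0[$w] >> (8 * (3 - $b))) & 0xff;
		die "Lk_mc_forward mismatch at word $w, byte $b\n"
			if $got != $expect;
	}
}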
Lk_inv:		# inv, inva
	.long	0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704
	.long	0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03
Lk_ipt:		# input transform (lo, hi)
	.long	0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca
	.long	0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd
Lk_sb1:		# sb1u, sb1t
	.long	0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15
	.long	0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e
Lk_sb2:		# sb2u, sb2t
	.long	0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b
	.long	0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5
Lk_sbo:		# sbou, sbot
	.long	0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2
	.long	0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e
Lk_dipt:	# decryption input transform
	.long	0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15
	.long	0x00650560, 0xe683e386, 0x94f191f4, 0x72177712
Lk_dsbo:	# decryption sbox final output
	.long	0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7
	.long	0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca
Lk_dsb9:	# decryption sbox output *9*u, *9*t
	.long	0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca
	.long	0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72
Lk_dsbd:	# decryption sbox output *D*u, *D*t
	.long	0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5
	.long	0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129
Lk_dsbb:	# decryption sbox output *B*u, *B*t
	.long	0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660
	.long	0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3
Lk_dsbe:	# decryption sbox output *E*u, *E*t
	.long	0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222
	.long	0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794
##
## Key schedule constants
##
Lk_dksd:	# decryption key schedule: invskew x*D
	.long	0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007
	.long	0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f
Lk_dksb:	# decryption key schedule: invskew x*B
	.long	0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603
	.long	0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9
Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
	.long	0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553
	.long	0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd
Lk_dks9:	# decryption key schedule: invskew x*9
	.long	0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a
	.long	0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b
Lk_rcon:	# rcon
	.long	0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70
Lk_s63:
	.long	0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b
Lk_opt:		# output transform
	.long	0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7
	.long	0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1
Lk_deskew:	# deskew tables: inverts the sbox's "skew"
	.long	0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d
	.long	0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128
	mflr	r12	#vvvvv "distance between . and _vpaes_consts"
	.byte	0,12,0x14,0,0,0,0,0
.asciz	"Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"
my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));

my ($inp,$out,$key) = map("r$_",(3..5));

my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));
##
## Fills register %r10 -> .aes_consts (so you can -fPIC)
## and %xmm9-%xmm15 as specified below.
##
_vpaes_encrypt_preheat:
	li	r11, 0xc0		# Lk_inv
	vxor	v7, v7, v7		# 0x00..00
	vspltisb	v8,4		# 0x04..04
	vspltisb	v9,0x0f		# 0x0f..0f
	.byte	0,12,0x14,0,0,0,0,0
##
## AES-encrypt %xmm0.
##
## Inputs:
##	%xmm0 = input
##	%xmm9-%xmm15 as in _vpaes_preheat
##	(%rdx) = scheduled keys
##
## Output in %xmm0
## Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax
##
_vpaes_encrypt_core:
	lwz	r8, 240($key)		# pull rounds
	lvx	v5, 0, $key		# vmovdqu (%r9), %xmm5		# round0 key
	vperm	v5, v5, v6, $keyperm	# align round key
	vsrb	v1, v0, v8		# vpsrlb \$4, %xmm0, %xmm0
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb %xmm1, %xmm2, %xmm1
	vperm	v1, $ipthi, $ipthi, v1	# vpshufb %xmm0, %xmm3, %xmm2
	vxor	v0, v0, v5		# vpxor %xmm5, %xmm1, %xmm0
	vxor	v0, v0, v1		# vpxor %xmm2, %xmm0, %xmm0
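
## The sequence above is the generic vperm pattern used all through
## this module: split each byte into 4-bit halves (vsrb extracts the
## high nibble; the low nibble needs no masking, since vperm with the
## same table in both source slots only honours the low index bits),
## look each nibble up in a 16-byte table, and vxor the two lookups
## together with the round key. A standalone Perl sketch of one byte,
## with made-up placeholder tables standing in for Lk_ipt:
sub vperm_transform {
	my ($byte, $lo_tbl, $hi_tbl) = @_;	# 16-entry tables
	my $lo = $byte & 0x0f;			# low nibble
	my $hi = ($byte >> 4) & 0x0f;		# vsrb v1, v0, v8
	return $lo_tbl->[$lo] ^ $hi_tbl->[$hi];	# vperm + vperm + vxor
}
my @lo_tbl = map { ($_ * 3) & 0xff } 0 .. 15;	# placeholder, not Lk_ipt
my @hi_tbl = map { ($_ * 7) & 0xff } 0 .. 15;	# placeholder, not Lk_ipt
printf("0x5a -> 0x%02x\n", vperm_transform(0x5a, \@lo_tbl, \@hi_tbl))
	if $ENV{VPAES_SKETCH};	# demo only; off by default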
	# middle of middle round
	vperm	v4, $sb1t, v7, v2	# vpshufb %xmm2, %xmm13, %xmm4	# 4 = sb1u
	lvx	v1, r12, r11		# vmovdqa -0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	vperm	v0, $sb1u, v7, v3	# vpshufb %xmm3, %xmm12, %xmm0	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor %xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	andi.	r11, r11, 0x30		# and \$0x30, %r11	# ... mod 4
	vperm	v5, $sb2t, v7, v2	# vpshufb %xmm2, %xmm15, %xmm5	# 4 = sb2u
	vxor	v0, v0, v4		# vpxor %xmm4, %xmm0, %xmm0	# 0 = A
	vperm	v2, $sb2u, v7, v3	# vpshufb %xmm3, %xmm14, %xmm2	# 2 = sb2t
	lvx	v4, r12, r10		# vmovdqa (%r11,%r10), %xmm4	# .Lk_mc_backward[]
	vperm	v3, v0, v7, v1		# vpshufb %xmm1, %xmm0, %xmm3	# 0 = B
	vxor	v2, v2, v5		# vpxor %xmm5, %xmm2, %xmm2	# 2 = 2A
	vperm	v0, v0, v7, v4		# vpshufb %xmm4, %xmm0, %xmm0	# 3 = D
	vxor	v3, v3, v2		# vpxor %xmm2, %xmm3, %xmm3	# 0 = 2A+B
	vperm	v4, v3, v7, v1		# vpshufb %xmm1, %xmm3, %xmm4	# 0 = 2B+C
	vxor	v0, v0, v3		# vpxor %xmm3, %xmm0, %xmm0	# 3 = 2A+B+D
	vxor	v0, v0, v4		# vpxor %xmm4, %xmm0, %xmm0	# 0 = 2A+3B+C+D
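
## Net effect of the sequence above, with "+" meaning xor, R the
## one-byte-per-column rotation from Lk_mc_forward (so B = R(A),
## C = R^2(A), and D = R^3(A), the latter fetched directly via
## Lk_mc_backward), and the sb2 tables supplying 2A:
##
##	(2A+B) + R(2A+B) + D  =  2A + 3B + C + D
##
## i.e. the MixColumns circulant (02,03,01,01) applied to the S-box
## output A, with multiplications in GF(2^8).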
	vsrb	v1, v0, v8		# vpsrlb \$4, %xmm0, %xmm0	# 1 = i
	vperm	v5, $invhi, $invhi, v0	# vpshufb %xmm1, %xmm11, %xmm5	# 2 = a/k
	vxor	v0, v0, v1		# vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	vperm	v3, $invlo, $invlo, v1	# vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	vperm	v4, $invlo, $invlo, v0	# vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	vxor	v3, v3, v5		# vpxor %xmm5, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	vxor	v4, v4, v5		# vpxor %xmm5, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v7, v3	# vpshufb %xmm3, %xmm10, %xmm2	# 2 = 1/iak
	lvx	v6, r9, $key		# vmovdqu (%r9), %xmm5
	vperm	v3, $invlo, v7, v4	# vpshufb %xmm4, %xmm10, %xmm3	# 3 = 1/jak
	vxor	v2, v2, v0		# vpxor %xmm1, %xmm2, %xmm2	# 2 = io
	vperm	v5, v5, v6, $keyperm	# align round key
	vxor	v3, v3, v1		# vpxor %xmm0, %xmm3, %xmm3	# 3 = jo
	# middle of last round
	# vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
	# vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	vperm	v4, $sbou, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbou
	lvx	v1, r12, r10		# vmovdqa 0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	vperm	v0, $sbot, v7, v3	# vpshufb %xmm3, %xmm0, %xmm0	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor %xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	vxor	v0, v0, v4		# vpxor %xmm4, %xmm0, %xmm0	# 0 = A
	vperm	v0, v0, v7, v1		# vpshufb %xmm1, %xmm0, %xmm0
	.byte	0,12,0x14,0,0,0,0,0
.globl	.vpaes_encrypt
.align	5
.vpaes_encrypt:
	mfspr	r7, 256			# save vrsave
	$PUSH	r6,$LRSAVE($sp)
	mtspr	256, r0			# preserve all AltiVec registers

	bl	_vpaes_encrypt_preheat

	neg	r8, $inp		# prepare for unaligned access
	lvsl	$keyperm, 0, $key
	lvsr	$outperm, 0, $out
	lvsr	$inpperm, 0, r8		# -$inp
	vnor	$outmask, v7, v7	# 0xff..ff
	lvx	$inptail, 0, $inp
	vperm	$outmask, v7, $outmask, $outperm
	addi	$inp, $inp, 15		# 15 is not a typo
	lvx	$outhead, 0, $out

	lvx	$inptail, 0, $inp	# redundant in aligned case
	vperm	v0, v0, $inptail, $inpperm

	bl	_vpaes_encrypt_core

	vperm	v0, v0, v0, $outperm	# rotate left
	vsel	v1, $outhead, v0, $outmask
	addi	$out, $out, 15		# 15 is not a typo
	lvx	v1, 0, $out		# redundant in aligned case
	vsel	v1, $outhead, v1, $outmask

	mtspr	256, r7			# restore vrsave
	.byte	0,12,0x14,1,0,0,3,0
.size	.vpaes_encrypt,.-.vpaes_encrypt
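
## The neg/lvsr/lvx/vperm dance above is AltiVec's standard unaligned
## access idiom: lvx fetches the aligned quadwords covering an
## unaligned address, and vperm, steered by the lvsr/lvsl-generated
## permute, splices the wanted 16 bytes out of their concatenation
## (stores run the trick backwards through vsel). A standalone Perl
## sketch of the load half, with memory modelled as a byte array:
sub load_unaligned {
	my ($mem, $addr) = @_;		# $mem: array of bytes
	my $base = $addr & ~15;		# what lvx actually fetches
	my @cat = @{$mem}[$base .. $base + 31];	# two aligned quadwords
	my $shift = $addr & 15;		# what the permute vector encodes
	return [ @cat[$shift .. $shift + 15] ];	# the vperm splice
}
my @mem = (0 .. 63);
print join(",", @{ load_unaligned(\@mem, 5) }), "\n"	# bytes 5..20
	if $ENV{VPAES_SKETCH};	# demo only; off by default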
_vpaes_decrypt_preheat:
	li	r11, 0xc0		# Lk_inv
	vxor	v7, v7, v7		# 0x00..00
	vspltisb	v8,4		# 0x04..04
	vspltisb	v9,0x0f		# 0x0f..0f
	.byte	0,12,0x14,0,0,0,0,0
##
## Decryption core
##
## Same API as encryption core.
##
_vpaes_decrypt_core:
	lwz	r8, 240($key)		# pull rounds
	lvx	v5, 0, $key		# vmovdqu (%r9), %xmm4		# round0 key
	vperm	v5, v5, v6, $keyperm	# align round key
	vsrb	v1, v0, v8		# vpsrlb \$4, %xmm0, %xmm0
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb %xmm1, %xmm2, %xmm2
	vperm	v1, $ipthi, $ipthi, v1	# vpshufb %xmm0, %xmm1, %xmm0
	vxor	v0, v0, v5		# vpxor %xmm4, %xmm2, %xmm2
	vxor	v0, v0, v1		# vpxor %xmm2, %xmm0, %xmm0
	# Inverse mix columns
	lvx	v0, r12, r11		# v5 and v0 are flipped
	# vmovdqa	-0x20(%r10),%xmm4	# 4 : sb9u
	# vmovdqa	-0x10(%r10),%xmm1	# 0 : sb9t
	vperm	v4, $sb9u, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sb9u
	vperm	v1, $sb9t, v7, v3	# vpshufb %xmm3, %xmm1, %xmm1	# 0 = sb9t
	vxor	v5, v5, v4		# vpxor %xmm4, %xmm0, %xmm0
	# vmovdqa	0x00(%r10),%xmm4	# 4 : sbdu
	vxor	v5, v5, v1		# vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
	# vmovdqa	0x10(%r10),%xmm1	# 0 : sbdt
	vperm	v4, $sbdu, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbdu
	vperm	v5, v5, v7, v0		# vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	vperm	v1, $sbdt, v7, v3	# vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbdt
	vxor	v5, v5, v4		# vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
	# vmovdqa	0x20(%r10), %xmm4	# 4 : sbbu
	vxor	v5, v5, v1		# vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
	# vmovdqa	0x30(%r10), %xmm1	# 0 : sbbt
	vperm	v4, $sbbu, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbbu
	vperm	v5, v5, v7, v0		# vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	vperm	v1, $sbbt, v7, v3	# vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbbt
	vxor	v5, v5, v4		# vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
	# vmovdqa	0x40(%r10), %xmm4	# 4 : sbeu
	vxor	v5, v5, v1		# vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
	# vmovdqa	0x50(%r10), %xmm1	# 0 : sbet
	vperm	v4, $sbeu, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbeu
	vperm	v5, v5, v7, v0		# vpshufb %xmm5, %xmm0, %xmm0	# MC ch
	vperm	v1, $sbet, v7, v3	# vpshufb %xmm3, %xmm1, %xmm1	# 0 = sbet
	vxor	v0, v5, v4		# vpxor %xmm4, %xmm0, %xmm0	# 4 = ch
	vxor	v0, v0, v1		# vpxor %xmm1, %xmm0, %xmm0	# 0 = ch
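
## Net effect of the chain above, reading the comment trail: each pair
## of tables (dsb9, dsbd, dsbb, dsbe) yields the inverse-sbox output
## premultiplied by one InvMixColumns coefficient (09, 0D, 0B, 0E),
## and the "MC ch" vperm rotates the accumulator between stages, so
##
##	ch = sbe + R( sbb + R( sbd + R( sb9 ) ) )
##
## evaluates the InvMixColumns circulant (0E,0B,0D,09) Horner-style,
## with "+" meaning xor and R the per-column byte rotation.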
	vsrb	v1, v0, v8		# vpsrlb \$4, %xmm0, %xmm0	# 1 = i
	vperm	v2, $invhi, $invhi, v0	# vpshufb %xmm1, %xmm11, %xmm2	# 2 = a/k
	vxor	v0, v0, v1		# vpxor %xmm0, %xmm1, %xmm1	# 0 = j
	vperm	v3, $invlo, $invlo, v1	# vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	vperm	v4, $invlo, $invlo, v0	# vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	vxor	v3, v3, v2		# vpxor %xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	vxor	v4, v4, v2		# vpxor %xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v7, v3	# vpshufb %xmm3, %xmm10, %xmm2	# 2 = 1/iak
	lvx	v6, r9, $key		# vmovdqu (%r9), %xmm0
	vperm	v3, $invlo, v7, v4	# vpshufb %xmm4, %xmm10, %xmm3	# 3 = 1/jak
	vxor	v2, v2, v0		# vpxor %xmm1, %xmm2, %xmm2	# 2 = io
	vperm	v5, v5, v6, $keyperm	# align round key
	vxor	v3, v3, v1		# vpxor %xmm0, %xmm3, %xmm3	# 3 = jo
	# middle of last round
	# vmovdqa	0x60(%r10), %xmm4	# 3 : sbou
	vperm	v4, $sbou, v7, v2	# vpshufb %xmm2, %xmm4, %xmm4	# 4 = sbou
	# vmovdqa	0x70(%r10), %xmm1	# 0 : sbot
	lvx	v2, r12, r10		# vmovdqa -0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	vperm	v1, $sbot, v7, v3	# vpshufb %xmm3, %xmm1, %xmm1	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor %xmm0, %xmm4, %xmm4	# 4 = sb1u + k
	vxor	v0, v1, v4		# vpxor %xmm4, %xmm1, %xmm0	# 0 = A
	vperm	v0, v0, v7, v2		# vpshufb %xmm2, %xmm0, %xmm0
	.byte	0,12,0x14,0,0,0,0,0
.globl	.vpaes_decrypt
.align	5
.vpaes_decrypt:
	mfspr	r7, 256			# save vrsave
	$PUSH	r6,$LRSAVE($sp)
	mtspr	256, r0			# preserve all AltiVec registers

	bl	_vpaes_decrypt_preheat

	neg	r8, $inp		# prepare for unaligned access
	lvsl	$keyperm, 0, $key
	lvsr	$outperm, 0, $out
	lvsr	$inpperm, 0, r8		# -$inp
	vnor	$outmask, v7, v7	# 0xff..ff
	lvx	$inptail, 0, $inp
	vperm	$outmask, v7, $outmask, $outperm
	addi	$inp, $inp, 15		# 15 is not a typo
	lvx	$outhead, 0, $out

	lvx	$inptail, 0, $inp	# redundant in aligned case
	vperm	v0, v0, $inptail, $inpperm

	bl	_vpaes_decrypt_core

	vperm	v0, v0, v0, $outperm	# rotate left
	vsel	v1, $outhead, v0, $outmask
	addi	$out, $out, 15		# 15 is not a typo
	lvx	v1, 0, $out		# redundant in aligned case
	vsel	v1, $outhead, v1, $outmask

	mtspr	256, r7			# restore vrsave
	.byte	0,12,0x14,1,0,0,3,0
.size	.vpaes_decrypt,.-.vpaes_decrypt
.globl	.vpaes_cbc_encrypt
.align	5
.vpaes_cbc_encrypt:
	$STU	$sp,-$FRAME($sp)
	$PUSH	r30,$FRAME-$SIZE_T*2($sp)
	$PUSH	r31,$FRAME-$SIZE_T*1($sp)
	$PUSH	r0, $FRAME+$LRSAVE($sp)

	sub.	r30, r5, r9		# copy length-16
	mr	r5, r6			# copy pointer to key
	mr	r31, r7			# copy pointer to iv
	cmpwi	r8, 0			# test direction
	mtspr	256, r6			# preserve all AltiVec registers

	neg	r8, r31			# load [potentially unaligned] iv
	lvsr	$inpperm, 0, r8		# -ivp
	vperm	v24, v24, v25, $inpperm

	neg	r8, $inp		# prepare for unaligned access
	lvsl	$keyperm, 0, $key
	lvsr	$outperm, 0, $out
	lvsr	$inpperm, 0, r8		# -$inp
	vnor	$outmask, v7, v7	# 0xff..ff
	lvx	$inptail, 0, $inp
	vperm	$outmask, v7, $outmask, $outperm
	addi	$inp, $inp, 15		# 15 is not a typo
	lvx	$outhead, 0, $out

	bl	_vpaes_encrypt_preheat

	lvx	$inptail, 0, $inp
	vperm	v0, v0, $inptail, $inpperm
	vxor	v0, v0, v24		# ^= iv

	bl	_vpaes_encrypt_core

	vmr	v24, v0			# put aside iv
	sub.	r30, r30, r0		# len -= 16
	vperm	v0, v0, v0, $outperm	# rotate left
	vsel	v1, $outhead, v0, $outmask

	bl	_vpaes_decrypt_preheat

	lvx	$inptail, 0, $inp
	vperm	v0, v0, $inptail, $inpperm
	vmr	v25, v0			# put aside input

	bl	_vpaes_decrypt_core

	vxor	v0, v0, v24		# ^= iv
	sub.	r30, r30, r0		# len -= 16
	vperm	v0, v0, v0, $outperm	# rotate left
	vsel	v1, $outhead, v0, $outmask

	lvx	v1, 0, $out		# redundant in aligned case
	vsel	v1, $outhead, v1, $outmask

	lvsr	$outperm, 0, r31	# write [potentially unaligned] iv
	vnor	$outmask, v7, v7	# 0xff..ff
	vperm	$outmask, v7, $outmask, $outperm
	lvx	$outhead, 0, $out
	vperm	v24, v24, v24, $outperm	# rotate
	vsel	v0, $outhead, v24, $outmask
	vsel	v1, v24, v1, $outmask

	mtspr	256, r7			# restore vrsave
	$POP	r0, $FRAME+$LRSAVE($sp)
	$POP	r30,$FRAME-$SIZE_T*2($sp)
	$POP	r31,$FRAME-$SIZE_T*1($sp)
	.byte	0,12,0x04,1,0x80,2,6,0
.size	.vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
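
# Structural sketch of the two CBC loops above, one integer standing in
# for one 16-byte block; encrypt_core/decrypt_core are xor placeholders
# for the real _vpaes_*_core subroutines, not AES.
sub encrypt_core { $_[0] ^ 0xAA }	# placeholder cipher
sub decrypt_core { $_[0] ^ 0xAA }	# placeholder inverse
sub cbc_encrypt_sketch {
	my ($blocks, $iv) = @_;
	my @out;
	for my $p (@$blocks) {
		$iv = encrypt_core($p ^ $iv);	# vxor v0,v0,v24; bl core
		push @out, $iv;			# vmr v24, v0: new iv
	}
	return \@out;
}
sub cbc_decrypt_sketch {
	my ($blocks, $iv) = @_;
	my @out;
	for my $c (@$blocks) {
		push @out, decrypt_core($c) ^ $iv;	# core, then ^= iv
		$iv = $c;			# vmr v25: put aside input
	}
	return \@out;
}
if ($ENV{VPAES_SKETCH}) {	# demo only; off by default
	my $ct = cbc_encrypt_sketch([1, 2, 3], 0x55);
	print join(",", @{ cbc_decrypt_sketch($ct, 0x55) }), "\n";  # 1,2,3
}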
my ($inp,$bits,$out)=map("r$_",(3..5));

my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
_vpaes_key_preheat:
	li	r11, 0xc0		# Lk_inv
	vspltisb	v8,4		# 0x04..04
	vxor	v9,v9,v9		# 0x00..00
	lvx	$invlo, r12, r11	# Lk_inv
	lvx	$iptlo, r12, r9		# Lk_ipt
	lvx	v14, r12, r11		# Lk_sb1
	lvx	v16, r12, r9		# Lk_dksd
	lvx	v18, r12, r11		# Lk_dksb
	lvx	v20, r12, r9		# Lk_dkse
	lvx	v22, r12, r11		# Lk_dks9
	lvx	v24, r12, r9		# Lk_rcon
	lvx	v25, 0, r12		# Lk_mc_forward[0]
	lvx	v26, r12, r8		# Lk_s63
	.byte	0,12,0x14,0,0,0,0,0
_vpaes_schedule_core:
	bl	_vpaes_key_preheat	# load the tables

	#lvx	v0, 0, $inp		# vmovdqu (%rdi), %xmm0		# load key (unaligned)
	neg	r8, $inp		# prepare for unaligned access
	addi	$inp, $inp, 15		# 15 is not a typo
	lvsr	$inpperm, 0, r8		# -$inp
	lvx	v6, 0, $inp		# v6 serves as inptail
	vperm	v0, v0, v6, $inpperm

	vmr	v3, v0			# vmovdqa %xmm0, %xmm3
	bl	_vpaes_schedule_transform
	vmr	v7, v0			# vmovdqa %xmm0, %xmm7

	bne	$dir, Lschedule_am_decrypting

	# encrypting, output zeroth round key after transform
	li	r8, 0x30		# mov	\$0x30,%r8d
	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10

	lvsr	$outperm, 0, $out	# prepare for unaligned access
	vspltisb	$outmask, -1	# 0xff..ff
	lvx	$outhead, 0, $out
	vperm	$outmask, v9, $outmask, $outperm

	#stvx	v0, 0, $out		# vmovdqu %xmm0, (%rdx)
	vperm	v1, v0, v0, $outperm	# rotate left
	vsel	v2, $outhead, v1, $outmask
Lschedule_am_decrypting:
	srwi	r8, $bits, 1		# shr	\$1,%r8d
	andi.	r8, r8, 32		# and	\$32,%r8d
	xori	r8, r8, 32		# xor	\$32,%r8d	# nbits==192?0:32
	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
	# decrypting, output zeroth round key after shiftrows
	lvx	v1, r8, r10		# vmovdqa (%r8,%r10), %xmm1
	vperm	v4, v3, v3, v1		# vpshufb %xmm1, %xmm3, %xmm3

	neg	r0, $out		# prepare for unaligned access
	addi	$out, $out, 15		# 15 is not a typo
	vspltisb	$outmask, -1	# 0xff..ff
	lvx	$outhead, 0, $out
	vperm	$outmask, $outmask, v9, $outperm

	#stvx	v4, 0, $out		# vmovdqu %xmm3, (%rdx)
	vperm	v4, v4, v4, $outperm	# rotate left
	vsel	v2, $outhead, v4, $outmask

	xori	r8, r8, 0x30		# xor	\$0x30, %r8
	cmplwi	$bits, 192		# cmp	\$192, %esi
##
## 128-bit specific part of key schedule.
##
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
##
Lschedule_128:
	li	r0, 10			# mov	\$10, %esi
	mtctr	r0

Loop_schedule_128:
	bl	_vpaes_schedule_round
	bdz	Lschedule_mangle_last	# dec	%esi
	bl	_vpaes_schedule_mangle	# write output
	b	Loop_schedule_128
##
## 192-bit specific part of key schedule.
##
## The main body of this schedule is the same as the 128-bit
## schedule, but with more smearing. The long, high side is
## stored in %xmm7 as before, and the short, low side is in
## the high bits of %xmm6.
##
## This schedule is somewhat nastier, however, because each
## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round
## keys.
##
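## (A quick count: the loop below runs 4 times and saves 3 round keys
## per pass; with the zeroth key written earlier that is 4*3 + 1 = 13
## round keys, exactly what AES-192's 12 rounds consume.)
##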
Lschedule_192:
	li	r0, 4			# mov	\$4, %esi
	vperm	v0, v6, v0, $inpperm
	vsldoi	v0, v3, v0, 8		# vmovdqu 8(%rdi),%xmm0		# load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	# input transform
	vsldoi	v6, v9, v6, 8		# clobber "low" side with zeros
	mtctr	r0

Loop_schedule_192:
	bl	_vpaes_schedule_round
	vsldoi	v0, v6, v0, 8		# vpalignr \$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle	# save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle	# save key n+1
	bl	_vpaes_schedule_round
	bdz	Lschedule_mangle_last	# dec	%esi
	bl	_vpaes_schedule_mangle	# save key n+2
	bl	_vpaes_schedule_192_smear
	b	Loop_schedule_192
##
## 256-bit specific part of key schedule.
##
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
##
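## (Counting as before: the loop below runs 7 times, each pass saving
## one high-side and one low-side round key; with the zeroth key that
## is 7*2 + 1 = 15 round keys for AES-256's 14 rounds.)
##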
Lschedule_256:
	li	r0, 7			# mov	\$7, %esi
	mtctr	r0
	lvx	v0, 0, $inp		# vmovdqu 16(%rdi),%xmm0	# load key part 2 (unaligned)
	vperm	v0, v6, v0, $inpperm
	bl	_vpaes_schedule_transform	# input transform

Loop_schedule_256:
	bl	_vpaes_schedule_mangle	# output low result
	vmr	v6, v0			# vmovdqa %xmm0, %xmm6	# save cur_lo in xmm6

	bl	_vpaes_schedule_round
	bdz	Lschedule_mangle_last	# dec	%esi
	bl	_vpaes_schedule_mangle

	# low round. swap xmm7 and xmm6
	vspltw	v0, v0, 3		# vpshufd \$0xFF, %xmm0, %xmm0
	vmr	v5, v7			# vmovdqa %xmm7, %xmm5
	vmr	v7, v6			# vmovdqa %xmm6, %xmm7
	bl	_vpaes_schedule_low_round
	vmr	v7, v5			# vmovdqa %xmm5, %xmm7
	b	Loop_schedule_256
##
## .aes_schedule_mangle_last
##
## Mangler for last round of key schedule
##
## when encrypting, outputs out(%xmm0) ^ 63
## when decrypting, outputs unskew(%xmm0)
##
## Always called right before return... jumps to cleanup and exits
##
Lschedule_mangle_last:
	# schedule last round key from xmm0
	li	r11, 0x2e0		# lea	.Lk_deskew(%rip),%r11
	bne	$dir, Lschedule_mangle_last_dec

	lvx	v1, r8, r10		# vmovdqa (%r8,%r10),%xmm1
	li	r11, 0x2c0		# lea	.Lk_opt(%rip), %r11	# prepare to output transform
	li	r9, 0x2d0		# prepare to output transform
	vperm	v0, v0, v0, v1		# vpshufb %xmm1, %xmm0, %xmm0	# output permute

	lvx	$iptlo, r11, r12	# reload $ipt
	addi	$out, $out, 16		# add	\$16, %rdx
	vxor	v0, v0, v26		# vpxor	.Lk_s63(%rip), %xmm0, %xmm0
	bl	_vpaes_schedule_transform	# output transform

	#stvx	v0, r0, $out		# vmovdqu %xmm0, (%rdx)	# save last key
	vperm	v0, v0, v0, $outperm	# rotate left
	vsel	v2, $outhead, v0, $outmask

	addi	$out, $out, 15		# 15 is not a typo
	lvx	v1, 0, $out		# redundant in aligned case
	vsel	v1, $outhead, v1, $outmask
	b	Lschedule_mangle_done
Lschedule_mangle_last_dec:
	lvx	$iptlo, r11, r12	# reload $ipt
	addi	$out, $out, -16		# add	\$-16, %rdx
	vxor	v0, v0, v26		# vpxor	.Lk_s63(%rip), %xmm0, %xmm0
	bl	_vpaes_schedule_transform	# output transform

	#stvx	v0, r0, $out		# vmovdqu %xmm0, (%rdx)	# save last key
	vperm	v0, v0, v0, $outperm	# rotate left
	vsel	v2, $outhead, v0, $outmask

	addi	$out, $out, -15		# -15 is not a typo
	lvx	v1, 0, $out		# redundant in aligned case
	vsel	v1, $outhead, v1, $outmask

Lschedule_mangle_done:
	vxor	v0, v0, v0		# vpxor	%xmm0, %xmm0, %xmm0
	vxor	v1, v1, v1		# vpxor	%xmm1, %xmm1, %xmm1
	vxor	v2, v2, v2		# vpxor	%xmm2, %xmm2, %xmm2
	vxor	v3, v3, v3		# vpxor	%xmm3, %xmm3, %xmm3
	vxor	v4, v4, v4		# vpxor	%xmm4, %xmm4, %xmm4
	vxor	v5, v5, v5		# vpxor	%xmm5, %xmm5, %xmm5
	vxor	v6, v6, v6		# vpxor	%xmm6, %xmm6, %xmm6
	vxor	v7, v7, v7		# vpxor	%xmm7, %xmm7, %xmm7
	.byte	0,12,0x14,0,0,0,0,0
##
## .aes_schedule_192_smear
##
## Smear the short, low side in the 192-bit key schedule.
##
## Inputs:
##	%xmm7: high side, b a x y
##	%xmm6: low side, d c 0 0
##
## Outputs:
##	%xmm6: b+c+d b+c 0 0
##	%xmm0: b+c+d b+c b a
##
_vpaes_schedule_192_smear:
	vsldoi	v1, v9, v6, 12		# vpshufd \$0x80, %xmm6, %xmm1	# d c 0 0 -> c 0 0 0
	vsldoi	v0, v7, v0, 8		# vpshufd \$0xFE, %xmm7, %xmm0	# b a _ _ -> b b b a
	vxor	v6, v6, v1		# vpxor	%xmm1, %xmm6, %xmm6	# -> c+d c 0 0
	vxor	v6, v6, v0		# vpxor	%xmm0, %xmm6, %xmm6	# -> b+c+d b+c b a
	vsldoi	v6, v9, v6, 8		# clobber low side with zeros
	.byte	0,12,0x14,0,0,0,0,0
##
## .aes_schedule_round
##
## Runs one main round of the key schedule on %xmm0, %xmm7
##
## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of
## %xmm7.
##
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## the next rcon.
##
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
##
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm4, %r11.
##
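## A scalar Perl sketch of what one such round computes -- the classic
## key-expansion recurrence on 32-bit words. sub_byte is a placeholder,
## not the real S-box, and the rcon handling is simplified; this shows
## the structure described above, not the vpaes math:
sub sub_byte { $_[0] ^ 0x63 }		# placeholder only
sub rot_word { (($_[0] << 8) | ($_[0] >> 24)) & 0xffffffff }
sub sub_word {
	my ($w, $r) = (shift, 0);
	$r = ($r << 8) | sub_byte(($w >> $_) & 0xff) for (24, 16, 8, 0);
	return $r;
}
sub schedule_round_sketch {
	my ($prev, $rcon) = @_;		# $prev: previous round key, 4 words
	my $t = sub_word(rot_word($prev->[3])) ^ ($rcon << 24);
	my @next;
	push @next, ($t ^= $prev->[$_]) for 0 .. 3;	# the "smear" xors
	return \@next;
}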
_vpaes_schedule_round:
	# extract rcon from xmm8
	#vxor	v4, v4, v4		# vpxor	%xmm4, %xmm4, %xmm4
	vsldoi	v1, $rcon, v9, 15	# vpalignr \$15, %xmm8, %xmm4, %xmm1
	vsldoi	$rcon, $rcon, $rcon, 15	# vpalignr \$15, %xmm8, %xmm8, %xmm8
	vxor	v7, v7, v1		# vpxor	%xmm1, %xmm7, %xmm7

	vspltw	v0, v0, 3		# vpshufd \$0xFF, %xmm0, %xmm0
	vsldoi	v0, v0, v0, 1		# vpalignr \$1, %xmm0, %xmm0, %xmm0
	# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	vsldoi	v1, v9, v7, 12		# vpslldq \$4, %xmm7, %xmm1
	vxor	v7, v7, v1		# vpxor	%xmm1, %xmm7, %xmm7
	vspltisb	v1, 0x0f	# 0x0f..0f
	vsldoi	v4, v9, v7, 8		# vpslldq \$8, %xmm7, %xmm4

	vand	v1, v1, v0		# vpand	%xmm9, %xmm0, %xmm1	# 0 = k
	vsrb	v0, v0, v8		# vpsrlb \$4, %xmm0, %xmm0	# 1 = i
	vxor	v7, v7, v4		# vpxor	%xmm4, %xmm7, %xmm7
	vperm	v2, $invhi, v9, v1	# vpshufb %xmm1, %xmm11, %xmm2	# 2 = a/k
	vxor	v1, v1, v0		# vpxor	%xmm0, %xmm1, %xmm1	# 0 = j
	vperm	v3, $invlo, v9, v0	# vpshufb %xmm0, %xmm10, %xmm3	# 3 = 1/i
	vxor	v3, v3, v2		# vpxor	%xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	vperm	v4, $invlo, v9, v1	# vpshufb %xmm1, %xmm10, %xmm4	# 4 = 1/j
	vxor	v7, v7, v26		# vpxor	.Lk_s63(%rip), %xmm7, %xmm7
	vperm	v3, $invlo, v9, v3	# vpshufb %xmm3, %xmm10, %xmm3	# 2 = 1/iak
	vxor	v4, v4, v2		# vpxor	%xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v9, v4	# vpshufb %xmm4, %xmm10, %xmm2	# 3 = 1/jak
	vxor	v3, v3, v1		# vpxor	%xmm1, %xmm3, %xmm3	# 2 = io
	vxor	v2, v2, v0		# vpxor	%xmm0, %xmm2, %xmm2	# 3 = jo
	vperm	v4, v15, v9, v3		# vpshufb %xmm3, %xmm13, %xmm4	# 4 = sbou
	vperm	v1, v14, v9, v2		# vpshufb %xmm2, %xmm12, %xmm1	# 0 = sb1t
	vxor	v1, v1, v4		# vpxor	%xmm4, %xmm1, %xmm1	# 0 = sbox output

	# add in smeared stuff
	vxor	v0, v1, v7		# vpxor	%xmm7, %xmm1, %xmm0
	vxor	v7, v1, v7		# vmovdqa %xmm0, %xmm7
	.byte	0,12,0x14,0,0,0,0,0
##
## .aes_schedule_transform
##
## Linear-transform %xmm0 according to tables at (%r11)
##
## Requires that %xmm9 = 0x0F0F... as in preheat
##
_vpaes_schedule_transform:
	#vand	v1, v0, v9		# vpand	%xmm9, %xmm0, %xmm1
	vsrb	v2, v0, v8		# vpsrlb \$4, %xmm0, %xmm0
	# vmovdqa	(%r11), %xmm2	# lo
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb %xmm1, %xmm2, %xmm2
	# vmovdqa	16(%r11), %xmm1	# hi
	vperm	v2, $ipthi, $ipthi, v2	# vpshufb %xmm0, %xmm1, %xmm0
	vxor	v0, v0, v2		# vpxor	%xmm2, %xmm0, %xmm0
	.byte	0,12,0x14,0,0,0,0,0
##
## .aes_schedule_mangle
##
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
##	xor with 0x63
##	multiply by circulant 0,1,1,1
##	apply shiftrows transform
##
## On decrypt,
##	multiply by "inverse mixcolumns" circulant E,B,D,9
##	deskew
##	apply shiftrows transform
##
## Writes out to (%rdx), and increments or decrements it
## Keeps track of round number mod 4 in %r8
## Clobbers xmm1-xmm5
##
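## (Reading the encrypt path below: with k' = k ^ Lk_s63 and R the
## Lk_mc_forward byte rotation, the three vperm/vxor steps compute
## R(k') + R^2(k') + R^3(k') -- the circulant 0,1,1,1 above, with "+"
## meaning xor -- before the shiftrows permute and the store.)
##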
_vpaes_schedule_mangle:
	#vmr	v4, v0			# vmovdqa %xmm0, %xmm4	# save xmm0 for later
	# vmovdqa	.Lk_mc_forward(%rip),%xmm5
	bne	$dir, Lschedule_mangle_dec

	vxor	v4, v0, v26		# vpxor	.Lk_s63(%rip), %xmm0, %xmm4
	addi	$out, $out, 16		# add	\$16, %rdx
	vperm	v4, v4, v4, v25		# vpshufb %xmm5, %xmm4, %xmm4
	vperm	v1, v4, v4, v25		# vpshufb %xmm5, %xmm4, %xmm1
	vperm	v3, v1, v1, v25		# vpshufb %xmm5, %xmm1, %xmm3
	vxor	v4, v4, v1		# vpxor	%xmm1, %xmm4, %xmm4
	lvx	v1, r8, r10		# vmovdqa (%r8,%r10), %xmm1
	vxor	v3, v3, v4		# vpxor	%xmm4, %xmm3, %xmm3

	vperm	v3, v3, v3, v1		# vpshufb %xmm1, %xmm3, %xmm3
	addi	r8, r8, -16		# add	\$-16, %r8
	andi.	r8, r8, 0x30		# and	\$0x30, %r8

	#stvx	v3, 0, $out		# vmovdqu %xmm3, (%rdx)
	vperm	v1, v3, v3, $outperm	# rotate left
	vsel	v2, $outhead, v1, $outmask
Lschedule_mangle_dec:
	# inverse mix columns
	# lea	.Lk_dksd(%rip),%r11
	vsrb	v1, v0, v8		# vpsrlb \$4, %xmm4, %xmm1	# 1 = hi
	#and	v4, v0, v9		# vpand	%xmm9, %xmm4, %xmm4	# 4 = lo

	# vmovdqa	0x00(%r11), %xmm2
	vperm	v2, v16, v16, v0	# vpshufb %xmm4, %xmm2, %xmm2
	# vmovdqa	0x10(%r11), %xmm3
	vperm	v3, v17, v17, v1	# vpshufb %xmm1, %xmm3, %xmm3
	vxor	v3, v3, v2		# vpxor	%xmm2, %xmm3, %xmm3
	vperm	v3, v3, v9, v25		# vpshufb %xmm5, %xmm3, %xmm3

	# vmovdqa	0x20(%r11), %xmm2
	vperm	v2, v18, v18, v0	# vpshufb %xmm4, %xmm2, %xmm2
	vxor	v2, v2, v3		# vpxor	%xmm3, %xmm2, %xmm2
	# vmovdqa	0x30(%r11), %xmm3
	vperm	v3, v19, v19, v1	# vpshufb %xmm1, %xmm3, %xmm3
	vxor	v3, v3, v2		# vpxor	%xmm2, %xmm3, %xmm3
	vperm	v3, v3, v9, v25		# vpshufb %xmm5, %xmm3, %xmm3

	# vmovdqa	0x40(%r11), %xmm2
	vperm	v2, v20, v20, v0	# vpshufb %xmm4, %xmm2, %xmm2
	vxor	v2, v2, v3		# vpxor	%xmm3, %xmm2, %xmm2
	# vmovdqa	0x50(%r11), %xmm3
	vperm	v3, v21, v21, v1	# vpshufb %xmm1, %xmm3, %xmm3
	vxor	v3, v3, v2		# vpxor	%xmm2, %xmm3, %xmm3

	# vmovdqa	0x60(%r11), %xmm2
	vperm	v2, v22, v22, v0	# vpshufb %xmm4, %xmm2, %xmm2
	vperm	v3, v3, v9, v25		# vpshufb %xmm5, %xmm3, %xmm3
	# vmovdqa	0x70(%r11), %xmm4
	vperm	v4, v23, v23, v1	# vpshufb %xmm1, %xmm4, %xmm4
	lvx	v1, r8, r10		# vmovdqa (%r8,%r10), %xmm1
	vxor	v2, v2, v3		# vpxor	%xmm3, %xmm2, %xmm2
	vxor	v3, v4, v2		# vpxor	%xmm2, %xmm4, %xmm3

	addi	$out, $out, -16		# add	\$-16, %rdx

	vperm	v3, v3, v3, v1		# vpshufb %xmm1, %xmm3, %xmm3
	addi	r8, r8, -16		# add	\$-16, %r8
	andi.	r8, r8, 0x30		# and	\$0x30, %r8

	#stvx	v3, 0, $out		# vmovdqu %xmm3, (%rdx)
	vperm	v1, v3, v3, $outperm	# rotate left
	vsel	v2, $outhead, v1, $outmask
	.byte	0,12,0x14,0,0,0,0,0
.globl	.vpaes_set_encrypt_key
.align	5
.vpaes_set_encrypt_key:
	mfspr	r6, 256			# save vrsave
	$PUSH	r0, $LRSAVE($sp)
	mtspr	256, r7			# preserve all AltiVec registers

	srwi	r9, $bits, 5		# shr	\$5,%eax
	addi	r9, r9, 6		# add	\$6,%eax
	stw	r9, 240($out)		# mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+6;

	cmplw	$dir, $bits, $bits	# set encrypt direction (always "equal")
	li	r8, 0x30		# mov	\$0x30,%r8d
	bl	_vpaes_schedule_core

	$POP	r0, $LRSAVE($sp)
	mtspr	256, r6			# restore vrsave
	.byte	0,12,0x14,1,0,0,3,0
.size	.vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key
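
## Standalone check of the rounds formula stored at offset 240 above:
## nbits/32 + 6 gives the standard AES round counts.
if ($ENV{VPAES_SKETCH}) {	# demo only; off by default
	printf("%d-bit key -> %d rounds\n", $_, $_ / 32 + 6)
		for (128, 192, 256);	# 10, 12, 14
}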
.globl	.vpaes_set_decrypt_key
.align	5
.vpaes_set_decrypt_key:
	mfspr	r6, 256			# save vrsave
	$PUSH	r0, $LRSAVE($sp)
	mtspr	256, r7			# preserve all AltiVec registers

	srwi	r9, $bits, 5		# shr	\$5,%eax
	addi	r9, r9, 6		# add	\$6,%eax
	stw	r9, 240($out)		# mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+6;

	slwi	r9, r9, 4		# shl	\$4,%eax
	add	$out, $out, r9		# lea	(%rdx,%rax),%rdx	# point at end of buffer:
					# the decryption schedule is written back to front

	cmplwi	$dir, $bits, 0		# set decrypt direction
	srwi	r8, $bits, 1		# shr	\$1,%r8d
	andi.	r8, r8, 32		# and	\$32,%r8d
	xori	r8, r8, 32		# xor	\$32,%r8d	# nbits==192?0:32
	bl	_vpaes_schedule_core

	$POP	r0, $LRSAVE($sp)
	mtspr	256, r6			# restore vrsave
	.byte	0,12,0x14,1,0,0,3,0
.size	.vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key