# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
######################################################################
## Constant-time SSSE3 AES core implementation.
##
## By Mike Hamburg (Stanford University), 2009
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
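##
## The central trick (detailed in the papers above) is that every
## S-box-style lookup is split into 4-bit halves and served from
## 16-byte tables with a vector byte permute (x86 pshufb, NEON tbl
## below), whose timing does not depend on the data. As a rough,
## illustrative C sketch of one such nibble-split lookup (not part
## of this file):
##
##	uint8_t nibble_lookup(uint8_t x,
##	                      const uint8_t lo[16], const uint8_t hi[16])
##	{
##		/* constant-time when lo/hi are served by tbl/pshufb */
##		return lo[x & 0x0f] ^ hi[x >> 4];
##	}
##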
######################################################################
# ARMv8 NEON adaptation by <appro@openssl.org>
#
# The reason for undertaking this effort is that there is at least one
# popular SoC based on Cortex-A53 that doesn't have the crypto extensions.
#
#                 CBC enc      ECB enc/dec(*)   [bit-sliced enc/dec]
# Cortex-A53      21.5         18.1/20.6        [17.5/19.8         ]
# Cortex-A57      36.0(**)     20.4/24.9(**)    [14.4/16.6         ]
# X-Gene          45.9(**)     45.8/57.7(**)    [33.1/37.6(**)     ]
# Denver(***)     16.6(**)     15.1/17.8(**)    [8.80/9.93         ]
# Apple A7(***)   22.7(**)     10.9/14.3        [8.45/10.0         ]
# Mongoose(***)   26.3(**)     21.0/25.0(**)    [13.3/16.8         ]
#
# (*)   ECB denotes approximate result for parallelizable modes
#       such as CBC decrypt, CTR, etc.;
# (**)  these results are worse than scalar compiler-generated
#       code, but it's constant-time and therefore preferred;
# (***) presented for reference/comparison purposes;
$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
.type _vpaes_consts,%object
.align 7 // totally strategic alignment
_vpaes_consts:
.Lk_mc_forward: // mc_forward
        .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
        .quad 0x080B0A0904070605, 0x000302010C0F0E0D
        .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
        .quad 0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:// mc_backward
        .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
        .quad 0x020100030E0D0C0F, 0x0A09080B06050407
        .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
        .quad 0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr: // sr
        .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
        .quad 0x030E09040F0A0500, 0x0B06010C07020D08
        .quad 0x0F060D040B020900, 0x070E050C030A0108
        .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
.Lk_inv: // inv, inva
        .quad 0x0E05060F0D080180, 0x040703090A0B0C02
        .quad 0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt: // input transform (lo, hi)
        .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
        .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo: // sbou, sbot
        .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
        .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1: // sb1u, sb1t
        .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
        .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2: // sb2u, sb2t
        .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
        .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
//
//  Decryption stuff
//
.Lk_dipt: // decryption input transform
        .quad 0x0F505B040B545F00, 0x154A411E114E451A
        .quad 0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo: // decryption sbox final output
        .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
        .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9: // decryption sbox output *9*u, *9*t
        .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
        .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd: // decryption sbox output *D*u, *D*t
        .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
        .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb: // decryption sbox output *B*u, *B*t
        .quad 0xD022649296B44200, 0x602646F6B0F2D404
        .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe: // decryption sbox output *E*u, *E*t
        .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
        .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
//
//  Key schedule constants
//
.Lk_dksd: // decryption key schedule: invskew x*D
        .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
        .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb: // decryption key schedule: invskew x*B
        .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
        .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse: // decryption key schedule: invskew x*E + 0x63
        .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
        .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9: // decryption key schedule: invskew x*9
        .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
        .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
.Lk_rcon: // rcon
        .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
.Lk_opt: // output transform
        .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
        .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew: // deskew tables: inverts the sbox's "skew"
        .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
        .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.asciz "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)"
.size _vpaes_consts,.-_vpaes_consts
my ($inp,$out,$key) = map("x$_",(0..2));

my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23));
my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet) = map("v$_.16b",(24..31));
##
##  _aes_preheat
##
##  Fills register %r10 -> .aes_consts (so you can -fPIC)
##  and %xmm9-%xmm15 as specified below.
##
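##  (In this AArch64 port the constant pointer lives in x10/x11 rather
##  than %r10, and the tables stay resident in high NEON registers,
##  v17 up to v31 depending on the routine, instead of %xmm9-%xmm15;
##  the x86 register names in the comments are kept from the original
##  SSSE3 version.)
##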
.type _vpaes_encrypt_preheat,%function
.align 4
_vpaes_encrypt_preheat:
        adr x10, .Lk_inv
        movi v17.16b, #0x0f
        ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv
        ld1 {v20.2d-v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo
        ld1 {v24.2d-v27.2d}, [x10] // .Lk_sb1, .Lk_sb2
        ret
.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
##
##  _aes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm9-%xmm15 as in _vpaes_preheat
##    (%rdx) = scheduled keys
##
##  Output in %xmm0
##  Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
##  Preserves %xmm6 - %xmm8 so you get some local vectors
##
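##  (AArch64 mapping used below: the block arrives in v7 and leaves in
##  v0, the round-key pointer advances in x9, the round counter is w8,
##  and v1-v5 serve as scratch; the x86 comments are verbatim from the
##  original SSSE3 code.)
##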
.type _vpaes_encrypt_core,%function
.align 4
_vpaes_encrypt_core:
        mov x9, $key
        ldr w8, [$key,#240] // pull rounds
        adr x11, .Lk_mc_forward+16
        // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
        ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
        and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
        ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
        tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
        // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
        tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
        eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
        eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
        b .Lenc_entry
.align 4
.Lenc_loop:
        // middle of middle round
        add x10, x11, #0x40
        tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
        ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
        tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
        eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
        tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
        eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
        tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
        ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
        tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
        eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
        tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
        eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
        tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
        eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
        and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... mod 4
        eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
        sub w8, w8, #1 // nr--
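        // An informal reading of the arithmetic above, in GF(2^8)
        // terms: A is the SubBytes output (sb1 tables), 2A its xtime
        // (sb2 tables), and B, C, D are byte rotations of A produced
        // by the mc_forward/mc_backward permutes; 2A+3B+C+D is then
        // the MixColumns of the row-shifted state, i.e. one full AES
        // round with AddRoundKey folded in via v16 above.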
.Lenc_entry:
        // top of round
        and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
        ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
        tbl v5.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
        eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
        tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
        tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
        eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
        eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
        tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
        tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
        eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
        eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
        ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
        cbnz w8, .Lenc_loop
        // middle of last round
        add x10, x11, #0x80
        // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
        // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
        tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
        ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
        tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
        eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
        eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
        tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
        ret
.size _vpaes_encrypt_core,.-_vpaes_encrypt_core
.globl vpaes_encrypt
.type vpaes_encrypt,%function
.align 4
vpaes_encrypt:
        .inst 0xd503233f // paciasp
        stp x29,x30,[sp,#-16]!
        add x29,sp,#0

        ld1 {v7.16b}, [$inp]
        bl _vpaes_encrypt_preheat
        bl _vpaes_encrypt_core
        st1 {v0.16b}, [$out]

        ldp x29,x30,[sp],#16
        .inst 0xd50323bf // autiasp
        ret
.size vpaes_encrypt,.-vpaes_encrypt
.type _vpaes_encrypt_2x,%function
.align 4
_vpaes_encrypt_2x:
        mov x9, $key
        ldr w8, [$key,#240] // pull rounds
        adr x11, .Lk_mc_forward+16
        // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
        ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
        and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
        ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
        and v9.16b, v15.16b, v17.16b
        ushr v8.16b, v15.16b, #4
        tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
        tbl v9.16b, {$iptlo}, v9.16b
        // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
        tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
        tbl v10.16b, {$ipthi}, v8.16b
        eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
        eor v8.16b, v9.16b, v16.16b
        eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
        eor v8.16b, v8.16b, v10.16b
        b .Lenc_2x_entry
.align 4
.Lenc_2x_loop:
        // middle of middle round
        add x10, x11, #0x40
        tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
        tbl v12.16b, {$sb1t}, v10.16b
        ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
        tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
        tbl v8.16b, {$sb1u}, v11.16b
        eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
        eor v12.16b, v12.16b, v16.16b
        tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
        tbl v13.16b, {$sb2t}, v10.16b
        eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
        eor v8.16b, v8.16b, v12.16b
        tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
        tbl v10.16b, {$sb2u}, v11.16b
        ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
        tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
        tbl v11.16b, {v8.16b}, v1.16b
        eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
        eor v10.16b, v10.16b, v13.16b
        tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
        tbl v8.16b, {v8.16b}, v4.16b
        eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
        eor v11.16b, v11.16b, v10.16b
        tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
        tbl v12.16b, {v11.16b},v1.16b
        eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
        eor v8.16b, v8.16b, v11.16b
        and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... mod 4
        eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
        eor v8.16b, v8.16b, v12.16b
        sub w8, w8, #1 // nr--
.Lenc_2x_entry:
        // top of round
        and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
        ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
        and v9.16b, v8.16b, v17.16b
        ushr v8.16b, v8.16b, #4
        tbl v5.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
        tbl v13.16b, {$invhi},v9.16b
        eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
        eor v9.16b, v9.16b, v8.16b
        tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
        tbl v11.16b, {$invlo},v8.16b
        tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
        tbl v12.16b, {$invlo},v9.16b
        eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
        eor v11.16b, v11.16b, v13.16b
        eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
        eor v12.16b, v12.16b, v13.16b
        tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
        tbl v10.16b, {$invlo},v11.16b
        tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
        tbl v11.16b, {$invlo},v12.16b
        eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
        eor v10.16b, v10.16b, v9.16b
        eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
        eor v11.16b, v11.16b, v8.16b
        ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
        cbnz w8, .Lenc_2x_loop
        // middle of last round
        add x10, x11, #0x80
        // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
        // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
        tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
        tbl v12.16b, {$sbou}, v10.16b
        ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
        tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
        tbl v8.16b, {$sbot}, v11.16b
        eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
        eor v12.16b, v12.16b, v16.16b
        eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
        eor v8.16b, v8.16b, v12.16b
        tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
        tbl v1.16b, {v8.16b},v1.16b
        ret
.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x
.type _vpaes_decrypt_preheat,%function
.align 4
_vpaes_decrypt_preheat:
        adr x10, .Lk_inv
        movi v17.16b, #0x0f
        adr x11, .Lk_dipt
        ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv
        ld1 {v20.2d-v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo
        ld1 {v24.2d-v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd
        ld1 {v28.2d-v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe
        ret
.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
##
##  Decryption core
##
##  Same API as encryption core.
##
.type _vpaes_decrypt_core,%function
.align 4
_vpaes_decrypt_core:
        mov x9, $key
        ldr w8, [$key,#240] // pull rounds

        // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
        lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11
        eor x11, x11, #0x30 // xor \$0x30, %r11
        adr x10, .Lk_sr
        and x11, x11, #0x30 // and \$0x30, %r11
        add x11, x11, x10
        adr x10, .Lk_mc_forward+48

        ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
        and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
        ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
        tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
        ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5
        // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
        tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
        eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
        eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
        b .Ldec_entry
.align 4
.Ldec_loop:
//
//  Inverse mix columns
//
        // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
        // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
        tbl v4.16b, {$sb9u}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
        tbl v1.16b, {$sb9t}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
        eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
        // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
        eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
        // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt

        tbl v4.16b, {$sbdu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
        tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
        tbl v1.16b, {$sbdt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
        eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
        // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
        eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
        // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt

        tbl v4.16b, {$sbbu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
        tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
        tbl v1.16b, {$sbbt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
        eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
        // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
        eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
        // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet

        tbl v4.16b, {$sbeu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
        tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
        tbl v1.16b, {$sbet}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
        eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
        ext v5.16b, v5.16b, v5.16b, #12 // vpalignr \$12, %xmm5, %xmm5, %xmm5
        eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
        sub w8, w8, #1 // sub \$1,%rax # nr--
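        // A reading aid (informal): the dsb9/dsbd/dsbb/dsbe table pairs
        // multiply the inverse-S-box output by 9, D, B and E in GF(2^8),
        // as their names suggest, and the interleaved tbl-by-v5 steps
        // rotate the accumulator so that the running sum implements
        // InvMixColumns; v5 itself is rotated by the ext above once per
        // round.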
.Ldec_entry:
        // top of round
        and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
        ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
        tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
        eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
        tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
        tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
        eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
        eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
        tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
        tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
        eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
        eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
        ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
        cbnz w8, .Ldec_loop
        // middle of last round
        // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
        tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
        // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
        ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
        tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
        eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
        eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
        tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
        ret
.size _vpaes_decrypt_core,.-_vpaes_decrypt_core
.globl vpaes_decrypt
.type vpaes_decrypt,%function
.align 4
vpaes_decrypt:
        .inst 0xd503233f // paciasp
        stp x29,x30,[sp,#-16]!
        add x29,sp,#0

        ld1 {v7.16b}, [$inp]
        bl _vpaes_decrypt_preheat
        bl _vpaes_decrypt_core
        st1 {v0.16b}, [$out]

        ldp x29,x30,[sp],#16
        .inst 0xd50323bf // autiasp
        ret
.size vpaes_decrypt,.-vpaes_decrypt
// v14-v15 input, v0-v1 output
.type _vpaes_decrypt_2x,%function
.align 4
_vpaes_decrypt_2x:
        mov x9, $key
        ldr w8, [$key,#240] // pull rounds

        // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
        lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11
        eor x11, x11, #0x30 // xor \$0x30, %r11
        adr x10, .Lk_sr
        and x11, x11, #0x30 // and \$0x30, %r11
        add x11, x11, x10
        adr x10, .Lk_mc_forward+48

        ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
        and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
        ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
        and v9.16b, v15.16b, v17.16b
        ushr v8.16b, v15.16b, #4
        tbl v2.16b, {$iptlo},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
        tbl v10.16b, {$iptlo},v9.16b
        ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5
        // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
        tbl v0.16b, {$ipthi},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
        tbl v8.16b, {$ipthi},v8.16b
        eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
        eor v10.16b, v10.16b, v16.16b
        eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
        eor v8.16b, v8.16b, v10.16b
        b .Ldec_2x_entry
.align 4
.Ldec_2x_loop:
//
//  Inverse mix columns
//
        // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
        // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
        tbl v4.16b, {$sb9u}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
        tbl v12.16b, {$sb9u}, v10.16b
        tbl v1.16b, {$sb9t}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
        tbl v9.16b, {$sb9t}, v11.16b
        eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
        eor v8.16b, v12.16b, v16.16b
        // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
        eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
        eor v8.16b, v8.16b, v9.16b
        // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt

        tbl v4.16b, {$sbdu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
        tbl v12.16b, {$sbdu}, v10.16b
        tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
        tbl v8.16b, {v8.16b},v5.16b
        tbl v1.16b, {$sbdt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
        tbl v9.16b, {$sbdt}, v11.16b
        eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
        eor v8.16b, v8.16b, v12.16b
        // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
        eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
        eor v8.16b, v8.16b, v9.16b
        // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt

        tbl v4.16b, {$sbbu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
        tbl v12.16b, {$sbbu}, v10.16b
        tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
        tbl v8.16b, {v8.16b},v5.16b
        tbl v1.16b, {$sbbt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
        tbl v9.16b, {$sbbt}, v11.16b
        eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
        eor v8.16b, v8.16b, v12.16b
        // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
        eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
        eor v8.16b, v8.16b, v9.16b
        // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet

        tbl v4.16b, {$sbeu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
        tbl v12.16b, {$sbeu}, v10.16b
        tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
        tbl v8.16b, {v8.16b},v5.16b
        tbl v1.16b, {$sbet}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
        tbl v9.16b, {$sbet}, v11.16b
        eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
        eor v8.16b, v8.16b, v12.16b
        ext v5.16b, v5.16b, v5.16b, #12 // vpalignr \$12, %xmm5, %xmm5, %xmm5
        eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
        eor v8.16b, v8.16b, v9.16b
        sub w8, w8, #1 // sub \$1,%rax # nr--
.Ldec_2x_entry:
        // top of round
        and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
        ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
        and v9.16b, v8.16b, v17.16b
        ushr v8.16b, v8.16b, #4
        tbl v2.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
        tbl v10.16b, {$invhi},v9.16b
        eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
        eor v9.16b, v9.16b, v8.16b
        tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
        tbl v11.16b, {$invlo},v8.16b
        tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
        tbl v12.16b, {$invlo},v9.16b
        eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
        eor v11.16b, v11.16b, v10.16b
        eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
        eor v12.16b, v12.16b, v10.16b
        tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
        tbl v10.16b, {$invlo},v11.16b
        tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
        tbl v11.16b, {$invlo},v12.16b
        eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
        eor v10.16b, v10.16b, v9.16b
        eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
        eor v11.16b, v11.16b, v8.16b
        ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
        cbnz w8, .Ldec_2x_loop
        // middle of last round
        // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
        tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
        tbl v12.16b, {$sbou}, v10.16b
        // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
        tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
        tbl v9.16b, {$sbot}, v11.16b
        ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
        eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
        eor v12.16b, v12.16b, v16.16b
        eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
        eor v8.16b, v9.16b, v12.16b
        tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
        tbl v1.16b, {v8.16b},v2.16b
        ret
.size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x
my ($inp,$bits,$out,$dir) = ("x0","w1","x2","w3");
my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));
########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
.type _vpaes_key_preheat,%function
.align 4
_vpaes_key_preheat:
        adr x10, .Lk_inv
        movi v16.16b, #0x5b // .Lk_s63
        adr x11, .Lk_sb1
        movi v17.16b, #0x0f // .Lk_s0F
        ld1 {v18.2d-v21.2d}, [x10] // .Lk_inv, .Lk_ipt
        adr x10, .Lk_dksd
        ld1 {v22.2d-v23.2d}, [x11] // .Lk_sb1
        adr x11, .Lk_mc_forward
        ld1 {v24.2d-v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb
        ld1 {v28.2d-v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9
        ld1 {v8.2d}, [x10] // .Lk_rcon
        ld1 {v9.2d}, [x11] // .Lk_mc_forward[0]
        ret
.size _vpaes_key_preheat,.-_vpaes_key_preheat
.type _vpaes_schedule_core,%function
.align 4
_vpaes_schedule_core:
        .inst 0xd503233f // paciasp
        stp x29, x30, [sp,#-16]!
        add x29,sp,#0

        bl _vpaes_key_preheat // load the tables

        ld1 {v0.16b}, [$inp],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)

        // input transform
        mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
        bl _vpaes_schedule_transform
        mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7

        adr x10, .Lk_sr // lea .Lk_sr(%rip),%r10
        add x8, x8, x10
        cbnz $dir, .Lschedule_am_decrypting

        // encrypting, output zeroth round key after transform
        st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx)
        b .Lschedule_go

.Lschedule_am_decrypting:
        // decrypting, output zeroth round key after shiftrows
        ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
        tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
        st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx)
        eor x8, x8, #0x30 // xor \$0x30, %r8

.Lschedule_go:
        cmp $bits, #192 // cmp \$192, %esi
        b.hi .Lschedule_256
        b.eq .Lschedule_192
        // 128: fall through
##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
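##  (The loop below runs 10 times: nine round keys are written by
##  _vpaes_schedule_mangle and the tenth by .Lschedule_mangle_last;
##  together with the round-0 key stored above, that is the 11 round
##  keys an AES-128 schedule needs.)
##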
.Lschedule_128:
        mov $inp, #10 // mov \$10, %esi

.Loop_schedule_128:
        sub $inp, $inp, #1 // dec %esi
        bl _vpaes_schedule_round
        cbz $inp, .Lschedule_mangle_last
        bl _vpaes_schedule_mangle // write output
        b .Loop_schedule_128
##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing. The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
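##  (Concretely: the loop below runs 4 cycles and saves 3 keys per
##  cycle, 12 in all; together with the initial key that covers the
##  13 round keys AES-192 needs.)
##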
.Lschedule_192:
        sub $inp, $inp, #8
        ld1 {v0.16b}, [$inp] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
        bl _vpaes_schedule_transform // input transform
        mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
        eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
        ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
        mov $inp, #4 // mov \$4, %esi

.Loop_schedule_192:
        sub $inp, $inp, #1 // dec %esi
        bl _vpaes_schedule_round
        ext v0.16b, v6.16b, v0.16b, #8 // vpalignr \$8,%xmm6,%xmm0,%xmm0
        bl _vpaes_schedule_mangle // save key n
        bl _vpaes_schedule_192_smear
        bl _vpaes_schedule_mangle // save key n+1
        bl _vpaes_schedule_round
        cbz $inp, .Lschedule_mangle_last
        bl _vpaes_schedule_mangle // save key n+2
        bl _vpaes_schedule_192_smear
        b .Loop_schedule_192
##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6. The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
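##  (Each pass below writes two round keys, one from the high round
##  and one from the low round; seven passes plus the initial key
##  give the 15 round keys of AES-256.)
##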
.Lschedule_256:
        ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
        bl _vpaes_schedule_transform // input transform
        mov $inp, #7 // mov \$7, %esi

.Loop_schedule_256:
        sub $inp, $inp, #1 // dec %esi
        bl _vpaes_schedule_mangle // output low result
        mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6

        // high round
        bl _vpaes_schedule_round
        cbz $inp, .Lschedule_mangle_last
        bl _vpaes_schedule_mangle

        // low round. swap xmm7 and xmm6
        dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0
        movi v4.16b, #0
        mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
        mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
        bl _vpaes_schedule_low_round
        mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7

        b .Loop_schedule_256
##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##	when encrypting, outputs out(%xmm0) ^ 63
##	when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
.Lschedule_mangle_last:
        // schedule last round key from xmm0
        adr x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew
        cbnz $dir, .Lschedule_mangle_last_dec

        // encrypting
        ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
        adr x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform
        add $out, $out, #32 // add \$32, %rdx
        tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute

.Lschedule_mangle_last_dec:
        ld1 {v20.2d-v21.2d}, [x11] // reload constants
        sub $out, $out, #16 // add \$-16, %rdx
        eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
        bl _vpaes_schedule_transform // output transform
        st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) # save last key
        // cleanup
        eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
        eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
        eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
        eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
        eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
        eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
        eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
        eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
        ldp x29, x30, [sp],#16
        .inst 0xd50323bf // autiasp
        ret
.size _vpaes_schedule_core,.-_vpaes_schedule_core
##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##	%xmm7: high side, b a x y
##	%xmm6: low side, d c 0 0
##
##  Outputs:
##	%xmm6: b+c+d b+c 0 0
##	%xmm0: b+c+d b+c b a
##
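##  (Dword-wise, the two xors below compute
##	(d c 0 0) ^ (c 0 0 0)     = (c+d   c   0 0)
##	(c+d c 0 0) ^ (b b b a)   = (b+c+d b+c b a),
##  after which the low 64 bits are re-zeroed to form %xmm6.)
##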
.type _vpaes_schedule_192_smear,%function
.align 4
_vpaes_schedule_192_smear:
        movi v1.16b, #0
        dup v0.4s, v7.s[3]
        ins v1.s[3], v6.s[2] // vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
        ins v0.s[0], v7.s[2] // vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
        eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
        eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
        eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
        mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
        ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
        ret
.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm4, %r11.
##
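##  In classic FIPS-197 terms this is one AES-128 expansion step; an
##  illustrative C sketch of the same computation (SubWord and rcon
##  assumed given, little-endian dwords; not code from this file):
##
##	uint32_t t = prev[3];			/* high dword */
##	t = SubWord(t);				/* subbytes */
##	t = (t >> 8) | (t << 24);		/* rotate by one byte */
##	next[0] = prev[0] ^ t ^ rcon;
##	next[1] = next[0] ^ prev[1];		/* smear */
##	next[2] = next[1] ^ prev[2];
##	next[3] = next[2] ^ prev[3];
##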
.type _vpaes_schedule_round,%function
.align 4
_vpaes_schedule_round:
        // extract rcon from xmm8
        movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
        ext v1.16b, $rcon, v4.16b, #15 // vpalignr \$15, %xmm8, %xmm4, %xmm1
        ext $rcon, $rcon, $rcon, #15 // vpalignr \$15, %xmm8, %xmm8, %xmm8
        eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7

        // rotate
        dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0
        ext v0.16b, v0.16b, v0.16b, #1 // vpalignr \$1, %xmm0, %xmm0, %xmm0

        // fall through...
        // low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
        // smear xmm7
        ext v1.16b, v4.16b, v7.16b, #12 // vpslldq \$4, %xmm7, %xmm1
        eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
        ext v4.16b, v4.16b, v7.16b, #8 // vpslldq \$8, %xmm7, %xmm4

        // subbytes
        and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
        ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
        eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
        tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
        eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
        tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
        eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
        tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
        eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7
        tbl v3.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
        eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
        tbl v2.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
        eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
        eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
        tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
        tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
        eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output

        // add in smeared stuff
        eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
        eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
        ret
.size _vpaes_schedule_round,.-_vpaes_schedule_round
##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%r11)
##
##  Requires that %xmm9 = 0x0F0F... as in preheat
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##
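##  (I.e. out = hi_tbl[x >> 4] ^ lo_tbl[x & 0xF] per byte: the
##  transform is GF(2)-linear, so it can be applied to the two
##  nibbles separately and xored back together.)
##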
.type _vpaes_schedule_transform,%function
.align 4
_vpaes_schedule_transform:
        and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
        ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
        // vmovdqa (%r11), %xmm2 # lo
        tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
        // vmovdqa 16(%r11), %xmm1 # hi
        tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
        eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
        ret
.size _vpaes_schedule_transform,.-_vpaes_schedule_transform
##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##	xor with 0x63
##	multiply by circulant 0,1,1,1
##	apply shiftrows transform
##
##  On decrypt,
##	multiply by "inverse mixcolumns" circulant E,B,D,9
##	deskew, apply shiftrows transform
##
##  Writes out to (%rdx), and increments or decrements it
##  Keeps track of round number mod 4 in %r8
##  Clobbers xmm1-xmm5
##
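##  (Here the round counter lives in x8: each call steps x8 by 48
##  modulo 64, i.e. 0x30 -> 0x20 -> 0x10 -> 0x00 -> 0x30, so the
##  .Lk_sr load below picks the shiftrows variant matching the round
##  number mod 4, while $out moves by +16 for encrypt schedules and
##  -16 for decrypt schedules.)
##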
.type _vpaes_schedule_mangle,%function
.align 4
_vpaes_schedule_mangle:
        mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
        // vmovdqa .Lk_mc_forward(%rip),%xmm5
        cbnz $dir, .Lschedule_mangle_dec

        // encrypting
        eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4
        add $out, $out, #16 // add \$16, %rdx
        tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
        tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
        tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
        eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
        ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
        eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3

        b .Lschedule_mangle_both
.Lschedule_mangle_dec:
        // inverse mix columns
        // lea .Lk_dksd(%rip),%r11
        ushr v1.16b, v4.16b, #4 // vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
        and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo

        // vmovdqa 0x00(%r11), %xmm2
        tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
        // vmovdqa 0x10(%r11), %xmm3
        tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
        eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
        tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3

        // vmovdqa 0x20(%r11), %xmm2
        tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
        eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
        // vmovdqa 0x30(%r11), %xmm3
        tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
        eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
        tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3

        // vmovdqa 0x40(%r11), %xmm2
        tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
        eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
        // vmovdqa 0x50(%r11), %xmm3
        tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
        eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3

        // vmovdqa 0x60(%r11), %xmm2
        tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
        tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
        // vmovdqa 0x70(%r11), %xmm4
        tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
        ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
        eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
        eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3

        sub $out, $out, #16 // add \$-16, %rdx
.Lschedule_mangle_both:
        tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
        add x8, x8, #64-16 // add \$-16, %r8
        and x8, x8, #~(1<<6) // and \$0x30, %r8
        st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx)
        ret
.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
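##
##  C-side prototypes, as the OpenSSL callers declare them (listed
##  here for reference only):
##
##	int vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
##	                          AES_KEY *key);
##	int vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
##	                          AES_KEY *key);
##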
.globl vpaes_set_encrypt_key
.type vpaes_set_encrypt_key,%function
.align 4
vpaes_set_encrypt_key:
        .inst 0xd503233f // paciasp
        stp x29,x30,[sp,#-16]!
        add x29,sp,#0
        stp d8,d9,[sp,#-16]! // ABI spec says so

        lsr w9, $bits, #5 // shr \$5,%eax
        add w9, w9, #5 // add \$5,%eax
        str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;

        mov $dir, #0 // mov \$0,%ecx
        mov x8, #0x30 // mov \$0x30,%r8d
        bl _vpaes_schedule_core

        ldp d8,d9,[sp],#16
        ldp x29,x30,[sp],#16
        .inst 0xd50323bf // autiasp
        ret
.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
.globl vpaes_set_decrypt_key
.type vpaes_set_decrypt_key,%function
.align 4
vpaes_set_decrypt_key:
        .inst 0xd503233f // paciasp
        stp x29,x30,[sp,#-16]!
        add x29,sp,#0
        stp d8,d9,[sp,#-16]! // ABI spec says so

        lsr w9, $bits, #5 // shr \$5,%eax
        add w9, w9, #5 // add \$5,%eax
        str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
        lsl w9, w9, #4 // shl \$4,%eax
        add $out, $out, #16 // lea 16(%rdx,%rax),%rdx
        add $out, $out, x9

        mov $dir, #1 // mov \$1,%ecx
        lsr w8, $bits, #1 // shr \$1,%r8d
        and x8, x8, #32 // and \$32,%r8d
        eor x8, x8, #32 // xor \$32,%r8d # nbits==192?0:32
        bl _vpaes_schedule_core

        ldp d8,d9,[sp],#16
        ldp x29,x30,[sp],#16
        .inst 0xd50323bf // autiasp
        ret
.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
my ($inp,$out,$len,$key,$ivec,$dir) = map("x$_",(0..5));
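##
##  C-side prototype, as declared by the OpenSSL callers (for
##  reference only):
##
##	void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
##	                       size_t length, const AES_KEY *key,
##	                       unsigned char *ivec, int enc);
##
##  enc != 0 encrypts (each output block is E(input ^ chain value));
##  enc == 0 branches to vpaes_cbc_decrypt below.
##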
.globl vpaes_cbc_encrypt
.type vpaes_cbc_encrypt,%function
.align 4
vpaes_cbc_encrypt:
        cbz $len, .Lcbc_abort
        cmp w5, #0 // check direction
        b.eq vpaes_cbc_decrypt

        .inst 0xd503233f // paciasp
        stp x29,x30,[sp,#-16]!
        add x29,sp,#0

        mov x17, $len // reassign
        mov x2, $key // reassign

        ld1 {v0.16b}, [$ivec] // load ivec
        bl _vpaes_encrypt_preheat
        b .Lcbc_enc_loop

.align 4
.Lcbc_enc_loop:
        ld1 {v7.16b}, [$inp],#16 // load input
        eor v7.16b, v7.16b, v0.16b // xor with ivec
        bl _vpaes_encrypt_core
        st1 {v0.16b}, [$out],#16 // save output
        sub x17, x17, #16
        b.hi .Lcbc_enc_loop

        st1 {v0.16b}, [$ivec] // write ivec

        ldp x29,x30,[sp],#16
        .inst 0xd50323bf // autiasp
.Lcbc_abort:
        ret
.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
.type vpaes_cbc_decrypt,%function
.align 4
vpaes_cbc_decrypt:
        .inst 0xd503233f // paciasp
        stp x29,x30,[sp,#-16]!
        add x29,sp,#0
        stp d8,d9,[sp,#-16]! // ABI spec says so
        stp d10,d11,[sp,#-16]!
        stp d12,d13,[sp,#-16]!
        stp d14,d15,[sp,#-16]!

        mov x17, $len // reassign
        mov x2, $key // reassign
        ld1 {v6.16b}, [$ivec] // load ivec
        bl _vpaes_decrypt_preheat
        tst x17, #16
        b.eq .Lcbc_dec_loop2x

        ld1 {v7.16b}, [$inp], #16 // load input
        bl _vpaes_decrypt_core
        eor v0.16b, v0.16b, v6.16b // xor with ivec
        orr v6.16b, v7.16b, v7.16b // next ivec value
        st1 {v0.16b}, [$out], #16
        sub x17, x17, #16
.Lcbc_dec_loop2x:
        ld1 {v14.16b,v15.16b}, [$inp], #32
        bl _vpaes_decrypt_2x
        eor v0.16b, v0.16b, v6.16b // xor with ivec
        eor v1.16b, v1.16b, v14.16b
        orr v6.16b, v15.16b, v15.16b
        st1 {v0.16b,v1.16b}, [$out], #32
        sub x17, x17, #32
        b.hi .Lcbc_dec_loop2x
        st1 {v6.16b}, [$ivec]

        ldp d14,d15,[sp],#16
        ldp d12,d13,[sp],#16
        ldp d10,d11,[sp],#16
        ldp d8,d9,[sp],#16
        ldp x29,x30,[sp],#16
        .inst 0xd50323bf // autiasp
        ret
.size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
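//
//  The ECB entry points below follow the same pattern as CBC decrypt:
//  an odd leading block (when the length is not a multiple of 32)
//  goes through the single-block core, after which the bulk is
//  processed two blocks per iteration with the _2x cores.
//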
.globl vpaes_ecb_encrypt
.type vpaes_ecb_encrypt,%function
.align 4
vpaes_ecb_encrypt:
        .inst 0xd503233f // paciasp
        stp x29,x30,[sp,#-16]!
        add x29,sp,#0
        stp d8,d9,[sp,#-16]! // ABI spec says so
        stp d10,d11,[sp,#-16]!
        stp d12,d13,[sp,#-16]!
        stp d14,d15,[sp,#-16]!

        mov x17, $len
        mov x2, $key
        bl _vpaes_encrypt_preheat
        tst x17, #16
        b.eq .Lecb_enc_loop

        ld1 {v7.16b}, [$inp],#16
        bl _vpaes_encrypt_core
        st1 {v0.16b}, [$out],#16
        sub x17, x17, #16

.Lecb_enc_loop:
        ld1 {v14.16b,v15.16b}, [$inp], #32
        bl _vpaes_encrypt_2x
        st1 {v0.16b,v1.16b}, [$out], #32
        sub x17, x17, #32
        b.hi .Lecb_enc_loop

        ldp d14,d15,[sp],#16
        ldp d12,d13,[sp],#16
        ldp d10,d11,[sp],#16
        ldp d8,d9,[sp],#16
        ldp x29,x30,[sp],#16
        .inst 0xd50323bf // autiasp
        ret
.size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
.globl vpaes_ecb_decrypt
.type vpaes_ecb_decrypt,%function
.align 4
vpaes_ecb_decrypt:
        .inst 0xd503233f // paciasp
        stp x29,x30,[sp,#-16]!
        add x29,sp,#0
        stp d8,d9,[sp,#-16]! // ABI spec says so
        stp d10,d11,[sp,#-16]!
        stp d12,d13,[sp,#-16]!
        stp d14,d15,[sp,#-16]!

        mov x17, $len
        mov x2, $key
        bl _vpaes_decrypt_preheat
        tst x17, #16
        b.eq .Lecb_dec_loop

        ld1 {v7.16b}, [$inp],#16
        bl _vpaes_decrypt_core
        st1 {v0.16b}, [$out],#16
        sub x17, x17, #16

.Lecb_dec_loop:
        ld1 {v14.16b,v15.16b}, [$inp], #32
        bl _vpaes_decrypt_2x
        st1 {v0.16b,v1.16b}, [$out], #32
        sub x17, x17, #32
        b.hi .Lecb_dec_loop

        ldp d14,d15,[sp],#16
        ldp d12,d13,[sp],#16
        ldp d10,d11,[sp],#16
        ldp d8,d9,[sp],#16
        ldp x29,x30,[sp],#16
        .inst 0xd50323bf // autiasp
        ret
.size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt