2 # Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 ######################################################################
11 ## Constant-time SSSE3 AES core implementation.
14 ## By Mike Hamburg (Stanford University), 2009
17 ## For details see http://shiftleft.org/papers/vector_aes/ and
18 ## http://crypto.stanford.edu/vpaes/.
20 ######################################################################
21 # ARMv8 NEON adaptation by <appro@openssl.org>
# The reason for undertaking this effort is that there is at least one
# popular SoC based on Cortex-A53 that doesn't have crypto extensions.
26 # CBC enc ECB enc/dec(*) [bit-sliced enc/dec]
27 # Cortex-A53 21.5 18.1/20.6 [17.5/19.8 ]
28 # Cortex-A57 36.0(**) 20.4/24.9(**) [14.4/16.6 ]
29 # X-Gene 45.9(**) 45.8/57.7(**) [33.1/37.6(**) ]
30 # Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ]
31 # Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ]
32 # Mongoose(***) 26.3(**) 21.0/25.0(**) [13.3/16.8 ]
34 # (*) ECB denotes approximate result for parallelizable modes
35 # such as CBC decrypt, CTR, etc.;
# (**)  these results are worse than scalar compiler-generated
#       code, but this code is constant-time and therefore preferred;
38 # (***) presented for reference/comparison purposes;
$flavour = shift;
$output  = shift;
43 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
44 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
45 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
46 die "can't locate arm-xlate.pl";
48 open OUT,"| \"$^X\" $xlate $flavour $output";
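# The build system passes a flavour followed by the output file name, so a
# typical invocation would look something like (flavour name is illustrative):
#   perl vpaes-armv8.pl linux64 vpaes-armv8.S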
54 .type _vpaes_consts,%object
55 .align 7 // totally strategic alignment
57 .Lk_mc_forward: // mc_forward
58 .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
59 .quad 0x080B0A0904070605, 0x000302010C0F0E0D
60 .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
61 .quad 0x000302010C0F0E0D, 0x080B0A0904070605
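// Each .quad pair above encodes one 16-byte tbl index vector (little-endian);
// e.g. the first .Lk_mc_forward row reads as bytes 01 02 03 00 05 06 07 04 ...,
// i.e. it rotates each 32-bit column by one byte position, which is how the
// byte rotation used by MixColumns is expressed as a table lookup.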
62 .Lk_mc_backward:// mc_backward
63 .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
64 .quad 0x020100030E0D0C0F, 0x0A09080B06050407
65 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
66 .quad 0x0A09080B06050407, 0x020100030E0D0C0F
68 .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
69 .quad 0x030E09040F0A0500, 0x0B06010C07020D08
70 .quad 0x0F060D040B020900, 0x070E050C030A0108
71 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
77 .quad 0x0E05060F0D080180, 0x040703090A0B0C02
78 .quad 0x01040A060F0B0780, 0x030D0E0C02050809
79 .Lk_ipt: // input transform (lo, hi)
80 .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
81 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
82 .Lk_sbo: // sbou, sbot
83 .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
84 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
85 .Lk_sb1: // sb1u, sb1t
86 .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
87 .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
88 .Lk_sb2: // sb2u, sb2t
89 .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
90 .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
95 .Lk_dipt: // decryption input transform
96 .quad 0x0F505B040B545F00, 0x154A411E114E451A
97 .quad 0x86E383E660056500, 0x12771772F491F194
98 .Lk_dsbo: // decryption sbox final output
99 .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
100 .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
101 .Lk_dsb9: // decryption sbox output *9*u, *9*t
102 .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
103 .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
104 .Lk_dsbd: // decryption sbox output *D*u, *D*t
105 .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
106 .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
107 .Lk_dsbb: // decryption sbox output *B*u, *B*t
108 .quad 0xD022649296B44200, 0x602646F6B0F2D404
109 .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
110 .Lk_dsbe: // decryption sbox output *E*u, *E*t
111 .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
112 .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
115 // Key schedule constants
117 .Lk_dksd: // decryption key schedule: invskew x*D
118 .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
119 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
120 .Lk_dksb: // decryption key schedule: invskew x*B
121 .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
122 .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
123 .Lk_dkse: // decryption key schedule: invskew x*E + 0x63
124 .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
125 .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
126 .Lk_dks9: // decryption key schedule: invskew x*9
127 .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
128 .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
131 .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
133 .Lk_opt: // output transform
134 .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
135 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
136 .Lk_deskew: // deskew tables: inverts the sbox's "skew"
137 .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
138 .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
140 .asciz "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)"
141 .size _vpaes_consts,.-_vpaes_consts
146 my ($inp,$out,$key) = map("x$_",(0..2));
148 my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23));
149 my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
150 my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31));
156 ## Fills register %r10 -> .aes_consts (so you can -fPIC)
157 ## and %xmm9-%xmm15 as specified below.
159 .type _vpaes_encrypt_preheat,%function
161 _vpaes_encrypt_preheat:
164 ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv
165 ld1 {v20.2d-v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo
166 ld1 {v24.2d-v27.2d}, [x10] // .Lk_sb1, .Lk_sb2
168 .size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
173 ## AES-encrypt %xmm0.
177 ## %xmm9-%xmm15 as in _vpaes_preheat
178 ## (%rdx) = scheduled keys
181 ## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
182 ## Preserves %xmm6 - %xmm8 so you get some local vectors
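##
##  A rough register map for this AArch64 port (the x86 names in the comments
##  are kept only as a cross-reference to the original SSSE3 code): the block
##  arrives in v7 and the result is returned in v0, x2 points at the scheduled
##  keys and x9 walks them, while v17-v27 stand in for %xmm9-%xmm15 as loaded
##  by _vpaes_encrypt_preheat.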
185 .type _vpaes_encrypt_core,%function
189 ldr w8, [$key,#240] // pull rounds
190 adr x11, .Lk_mc_forward+16
191 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
192 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
193 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
194 ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
195 tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
196 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
197 tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
198 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
199 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
204 // middle of middle round
206 tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
207 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
208 tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
209 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
210 tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
211 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
212 tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
213 ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
214 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
215 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
216 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
217 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
218 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
219 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
220 and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... mod 4
221 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
222 sub w8, w8, #1 // nr--
226 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
227 ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
228 tbl v5.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
229 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
230 tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
231 tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
232 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
233 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
234 tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
235 tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
236 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
237 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
238 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
241 // middle of last round
243 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
244 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
245 tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
246 ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
247 tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
248 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
249 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
250 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
252 .size _vpaes_encrypt_core,.-_vpaes_encrypt_core
255 .type vpaes_encrypt,%function
258 stp x29,x30,[sp,#-16]!
262 bl _vpaes_encrypt_preheat
263 bl _vpaes_encrypt_core
268 .size vpaes_encrypt,.-vpaes_encrypt
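// From C this entry point is used as a drop-in for AES_encrypt, i.e. the
// assumed prototype is vpaes_encrypt(const unsigned char *in,
// unsigned char *out, const AES_KEY *key), with in=x0, out=x1, key=x2.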
270 .type _vpaes_encrypt_2x,%function
274 ldr w8, [$key,#240] // pull rounds
275 adr x11, .Lk_mc_forward+16
276 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
277 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
278 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
279 ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
280 and v9.16b, v15.16b, v17.16b
281 ushr v8.16b, v15.16b, #4
282 tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
283 tbl v9.16b, {$iptlo}, v9.16b
284 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
285 tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
286 tbl v10.16b, {$ipthi}, v8.16b
287 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
288 eor v8.16b, v9.16b, v16.16b
289 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
290 eor v8.16b, v8.16b, v10.16b
295 // middle of middle round
297 tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
298 tbl v12.16b, {$sb1t}, v10.16b
299 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
300 tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
301 tbl v8.16b, {$sb1u}, v11.16b
302 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
303 eor v12.16b, v12.16b, v16.16b
304 tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
305 tbl v13.16b, {$sb2t}, v10.16b
306 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
307 eor v8.16b, v8.16b, v12.16b
308 tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
309 tbl v10.16b, {$sb2u}, v11.16b
310 ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
311 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
312 tbl v11.16b, {v8.16b}, v1.16b
313 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
314 eor v10.16b, v10.16b, v13.16b
315 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
316 tbl v8.16b, {v8.16b}, v4.16b
317 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
318 eor v11.16b, v11.16b, v10.16b
319 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
320 tbl v12.16b, {v11.16b},v1.16b
321 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
322 eor v8.16b, v8.16b, v11.16b
323 and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... mod 4
324 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
325 eor v8.16b, v8.16b, v12.16b
326 sub w8, w8, #1 // nr--
330 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
331 ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
332 and v9.16b, v8.16b, v17.16b
333 ushr v8.16b, v8.16b, #4
334 tbl v5.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
335 tbl v13.16b, {$invhi},v9.16b
336 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
337 eor v9.16b, v9.16b, v8.16b
338 tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
339 tbl v11.16b, {$invlo},v8.16b
340 tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
341 tbl v12.16b, {$invlo},v9.16b
342 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
343 eor v11.16b, v11.16b, v13.16b
344 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
345 eor v12.16b, v12.16b, v13.16b
346 tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
347 tbl v10.16b, {$invlo},v11.16b
348 tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
349 tbl v11.16b, {$invlo},v12.16b
350 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
351 eor v10.16b, v10.16b, v9.16b
352 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
353 eor v11.16b, v11.16b, v8.16b
354 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
355 cbnz w8, .Lenc_2x_loop
357 // middle of last round
359 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
360 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
361 tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
362 tbl v12.16b, {$sbou}, v10.16b
363 ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
364 tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
365 tbl v8.16b, {$sbot}, v11.16b
366 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
367 eor v12.16b, v12.16b, v16.16b
368 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
369 eor v8.16b, v8.16b, v12.16b
370 tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
371 tbl v1.16b, {v8.16b},v1.16b
373 .size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x
375 .type _vpaes_decrypt_preheat,%function
377 _vpaes_decrypt_preheat:
381 ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv
382 ld1 {v20.2d-v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo
383 ld1 {v24.2d-v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd
384 ld1 {v28.2d-v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe
386 .size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
391 ## Same API as encryption core.
393 .type _vpaes_decrypt_core,%function
397 ldr w8, [$key,#240] // pull rounds
399 // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
400 lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11
401 eor x11, x11, #0x30 // xor \$0x30, %r11
403 and x11, x11, #0x30 // and \$0x30, %r11
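// e.g. with 9 rounds in w8 (a 128-bit key) this leaves x11 = ((9<<4)^0x30)&0x30
// = 0x20, i.e. the byte offset of the .Lk_sr row used for the final permutation.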
405 adr x10, .Lk_mc_forward+48
407 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
408 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
409 ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
410 tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
411 ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5
412 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
413 tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
414 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
415 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
421 // Inverse mix columns
423 // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
424 // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
425 tbl v4.16b, {$sb9u}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
426 tbl v1.16b, {$sb9t}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
427 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
428 // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
429 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
430 // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
432 tbl v4.16b, {$sbdu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
433 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
434 tbl v1.16b, {$sbdt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
435 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
436 // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
437 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
438 // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
440 tbl v4.16b, {$sbbu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
441 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
442 tbl v1.16b, {$sbbt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
443 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
444 // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
445 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
446 // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
448 tbl v4.16b, {$sbeu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
449 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
450 tbl v1.16b, {$sbet}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
451 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
452 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr \$12, %xmm5, %xmm5, %xmm5
453 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
454 sub w8, w8, #1 // sub \$1,%rax # nr--
458 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
459 ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
460 tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
461 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
462 tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
463 tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
464 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
465 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
466 tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
467 tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
468 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
469 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
470 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
473 // middle of last round
474 // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
475 tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
476 // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
477 ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
478 tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
479 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
480 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
481 tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
483 .size _vpaes_decrypt_core,.-_vpaes_decrypt_core
486 .type vpaes_decrypt,%function
489 stp x29,x30,[sp,#-16]!
493 bl _vpaes_decrypt_preheat
494 bl _vpaes_decrypt_core
499 .size vpaes_decrypt,.-vpaes_decrypt
501 // v14-v15 input, v0-v1 output
502 .type _vpaes_decrypt_2x,%function
506 ldr w8, [$key,#240] // pull rounds
508 // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
509 lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11
510 eor x11, x11, #0x30 // xor \$0x30, %r11
512 and x11, x11, #0x30 // and \$0x30, %r11
514 adr x10, .Lk_mc_forward+48
516 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
517 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
518 ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
519 and v9.16b, v15.16b, v17.16b
520 ushr v8.16b, v15.16b, #4
521 tbl v2.16b, {$iptlo},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
522 tbl v10.16b, {$iptlo},v9.16b
523 ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5
524 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
525 tbl v0.16b, {$ipthi},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
526 tbl v8.16b, {$ipthi},v8.16b
527 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
528 eor v10.16b, v10.16b, v16.16b
529 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
530 eor v8.16b, v8.16b, v10.16b
536 // Inverse mix columns
538 // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
539 // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
540 tbl v4.16b, {$sb9u}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
541 tbl v12.16b, {$sb9u}, v10.16b
542 tbl v1.16b, {$sb9t}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
543 tbl v9.16b, {$sb9t}, v11.16b
544 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
545 eor v8.16b, v12.16b, v16.16b
546 // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
547 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
548 eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
549 // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
551 tbl v4.16b, {$sbdu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
552 tbl v12.16b, {$sbdu}, v10.16b
553 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
554 tbl v8.16b, {v8.16b},v5.16b
555 tbl v1.16b, {$sbdt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
556 tbl v9.16b, {$sbdt}, v11.16b
557 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
558 eor v8.16b, v8.16b, v12.16b
559 // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
560 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
561 eor v8.16b, v8.16b, v9.16b
562 // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
564 tbl v4.16b, {$sbbu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
565 tbl v12.16b, {$sbbu}, v10.16b
566 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
567 tbl v8.16b, {v8.16b},v5.16b
568 tbl v1.16b, {$sbbt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
569 tbl v9.16b, {$sbbt}, v11.16b
570 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
571 eor v8.16b, v8.16b, v12.16b
572 // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
573 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
574 eor v8.16b, v8.16b, v9.16b
575 // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
577 tbl v4.16b, {$sbeu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
578 tbl v12.16b, {$sbeu}, v10.16b
579 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
580 tbl v8.16b, {v8.16b},v5.16b
581 tbl v1.16b, {$sbet}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
582 tbl v9.16b, {$sbet}, v11.16b
583 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
584 eor v8.16b, v8.16b, v12.16b
585 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr \$12, %xmm5, %xmm5, %xmm5
586 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
587 eor v8.16b, v8.16b, v9.16b
588 sub w8, w8, #1 // sub \$1,%rax # nr--
592 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
593 ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
594 and v9.16b, v8.16b, v17.16b
595 ushr v8.16b, v8.16b, #4
596 tbl v2.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
597 tbl v10.16b, {$invhi},v9.16b
598 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
599 eor v9.16b, v9.16b, v8.16b
600 tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
601 tbl v11.16b, {$invlo},v8.16b
602 tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
603 tbl v12.16b, {$invlo},v9.16b
604 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
605 eor v11.16b, v11.16b, v10.16b
606 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
607 eor v12.16b, v12.16b, v10.16b
608 tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
609 tbl v10.16b, {$invlo},v11.16b
610 tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
611 tbl v11.16b, {$invlo},v12.16b
612 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
613 eor v10.16b, v10.16b, v9.16b
614 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
615 eor v11.16b, v11.16b, v8.16b
616 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
617 cbnz w8, .Ldec_2x_loop
619 // middle of last round
620 // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
621 tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
622 tbl v12.16b, {$sbou}, v10.16b
623 // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
624 tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
625 tbl v9.16b, {$sbot}, v11.16b
626 ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
627 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
628 eor v12.16b, v12.16b, v16.16b
629 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
630 eor v8.16b, v9.16b, v12.16b
631 tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
632 tbl v1.16b, {v8.16b},v2.16b
634 .size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x
638 my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3");
639 my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));
642 ########################################################
644 ## AES key schedule ##
646 ########################################################
647 .type _vpaes_key_preheat,%function
651 movi v16.16b, #0x5b // .Lk_s63
653 movi v17.16b, #0x0f // .Lk_s0F
654 ld1 {v18.2d-v21.2d}, [x10] // .Lk_inv, .Lk_ipt
656 ld1 {v22.2d-v23.2d}, [x11] // .Lk_sb1
657 adr x11, .Lk_mc_forward
658 ld1 {v24.2d-v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb
659 ld1 {v28.2d-v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9
660 ld1 {v8.2d}, [x10] // .Lk_rcon
661 ld1 {v9.2d}, [x11] // .Lk_mc_forward[0]
663 .size _vpaes_key_preheat,.-_vpaes_key_preheat
665 .type _vpaes_schedule_core,%function
667 _vpaes_schedule_core:
668 stp x29, x30, [sp,#-16]!
671 bl _vpaes_key_preheat // load the tables
673 ld1 {v0.16b}, [$inp],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
676 mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
677 bl _vpaes_schedule_transform
678 mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
680 adr x10, .Lk_sr // lea .Lk_sr(%rip),%r10
682 cbnz $dir, .Lschedule_am_decrypting
684 // encrypting, output zeroth round key after transform
685 st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx)
688 .Lschedule_am_decrypting:
689 // decrypting, output zeroth round key after shiftrows
690 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
691 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
692 st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx)
693 eor x8, x8, #0x30 // xor \$0x30, %r8
696 cmp $bits, #192 // cmp \$192, %esi
704 ## 128-bit specific part of key schedule.
706 ## This schedule is really simple, because all its parts
707 ## are accomplished by the subroutines.
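##
##  (The loop below runs 10 times: nine keys written by _vpaes_schedule_mangle
##  plus the .Lschedule_mangle_last output, which together with the round-0
##  key stored earlier gives the 11 round keys of AES-128.)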
710 mov $inp, #10 // mov \$10, %esi
713 sub $inp, $inp, #1 // dec %esi
714 bl _vpaes_schedule_round
715 cbz $inp, .Lschedule_mangle_last
716 bl _vpaes_schedule_mangle // write output
722 ## 192-bit specific part of key schedule.
724 ## The main body of this schedule is the same as the 128-bit
725 ## schedule, but with more smearing. The long, high side is
726 ## stored in %xmm7 as before, and the short, low side is in
727 ## the high bits of %xmm6.
729 ## This schedule is somewhat nastier, however, because each
730 ## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round keys.
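##
##  (Four passes through the loop, together with the round-0 key stored
##  earlier and the .Lschedule_mangle_last output, yield the 13 round keys
##  of 12-round AES-192.)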
737 ld1 {v0.16b}, [$inp] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
738 bl _vpaes_schedule_transform // input transform
739 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
740 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
741 ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
742 mov $inp, #4 // mov \$4, %esi
745 sub $inp, $inp, #1 // dec %esi
746 bl _vpaes_schedule_round
747 ext v0.16b, v6.16b, v0.16b, #8 // vpalignr \$8,%xmm6,%xmm0,%xmm0
748 bl _vpaes_schedule_mangle // save key n
749 bl _vpaes_schedule_192_smear
750 bl _vpaes_schedule_mangle // save key n+1
751 bl _vpaes_schedule_round
752 cbz $inp, .Lschedule_mangle_last
753 bl _vpaes_schedule_mangle // save key n+2
754 bl _vpaes_schedule_192_smear
760 ## 256-bit specific part of key schedule.
762 ## The structure here is very similar to the 128-bit
763 ## schedule, but with an additional "low side" in
764 ## %xmm6. The low side's rounds are the same as the
765 ## high side's, except no rcon and no rotation.
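##
##  (The loop below runs 7 times; together with the round-0 key stored
##  earlier and the .Lschedule_mangle_last output this produces the 15 round
##  keys required for 14-round AES-256.)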
769 ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
770 bl _vpaes_schedule_transform // input transform
771 mov $inp, #7 // mov \$7, %esi
774 sub $inp, $inp, #1 // dec %esi
775 bl _vpaes_schedule_mangle // output low result
776 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
779 bl _vpaes_schedule_round
780 cbz $inp, .Lschedule_mangle_last
781 bl _vpaes_schedule_mangle
783 // low round. swap xmm7 and xmm6
784 dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0
786 mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
787 mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
788 bl _vpaes_schedule_low_round
789 mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
794 ## .aes_schedule_mangle_last
796 ## Mangler for last round of key schedule
798 ## when encrypting, outputs out(%xmm0) ^ 63
799 ## when decrypting, outputs unskew(%xmm0)
801 ## Always called right before return... jumps to cleanup and exits
804 .Lschedule_mangle_last:
805 // schedule last round key from xmm0
806 adr x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew
807 cbnz $dir, .Lschedule_mangle_last_dec
810 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
811 adr x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform
812 add $out, $out, #32 // add \$32, %rdx
813 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
815 .Lschedule_mangle_last_dec:
816 ld1 {v20.2d-v21.2d}, [x11] // reload constants
817 sub $out, $out, #16 // add \$-16, %rdx
818 eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
819 bl _vpaes_schedule_transform // output transform
820 st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) # save last key
823 eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
824 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
825 eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
826 eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
827 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
828 eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
829 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
830 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
831 ldp x29, x30, [sp],#16
833 .size _vpaes_schedule_core,.-_vpaes_schedule_core
836 ## .aes_schedule_192_smear
838 ## Smear the short, low side in the 192-bit key schedule.
841 ## %xmm7: high side, b a x y
842 ## %xmm6: low side, d c 0 0
846 ## %xmm6: b+c+d b+c 0 0
847 ## %xmm0: b+c+d b+c b a
849 .type _vpaes_schedule_192_smear,%function
851 _vpaes_schedule_192_smear:
854 ins v1.s[3], v6.s[2] // vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
855 ins v0.s[0], v7.s[2] // vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
856 eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
857 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
858 eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
859 mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
860 ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
862 .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
865 ## .aes_schedule_round
867 ## Runs one main round of the key schedule on %xmm0, %xmm7
869 ## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of %xmm7.
##
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for the next rcon.
876 ## Smears the dwords of %xmm7 by xoring the low into the
877 ## second low, result into third, result into highest.
879 ## Returns results in %xmm7 = %xmm0.
880 ## Clobbers %xmm1-%xmm4, %r11.
882 .type _vpaes_schedule_round,%function
884 _vpaes_schedule_round:
885 // extract rcon from xmm8
886 movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
887 ext v1.16b, $rcon, v4.16b, #15 // vpalignr \$15, %xmm8, %xmm4, %xmm1
888 ext $rcon, $rcon, $rcon, #15 // vpalignr \$15, %xmm8, %xmm8, %xmm8
889 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
892 dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0
893 ext v0.16b, v0.16b, v0.16b, #1 // vpalignr \$1, %xmm0, %xmm0, %xmm0
897 // low round: same as high round, but no rotation and no rcon.
898 _vpaes_schedule_low_round:
900 ext v1.16b, v4.16b, v7.16b, #12 // vpslldq \$4, %xmm7, %xmm1
901 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
902 ext v4.16b, v4.16b, v7.16b, #8 // vpslldq \$8, %xmm7, %xmm4
905 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
906 ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
907 eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
908 tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
909 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
910 tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
911 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
912 tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
913 eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7
914 tbl v3.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
915 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
916 tbl v2.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
917 eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
918 eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
919 tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
920 tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
921 eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
923 // add in smeared stuff
924 eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
925 eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
927 .size _vpaes_schedule_round,.-_vpaes_schedule_round
930 ## .aes_schedule_transform
932 ## Linear-transform %xmm0 according to tables at (%r11)
934 ## Requires that %xmm9 = 0x0F0F... as in preheat
936 ## Clobbers %xmm1, %xmm2
938 .type _vpaes_schedule_transform,%function
940 _vpaes_schedule_transform:
941 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
942 ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
943 // vmovdqa (%r11), %xmm2 # lo
944 tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
945 // vmovdqa 16(%r11), %xmm1 # hi
946 tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
947 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
949 .size _vpaes_schedule_transform,.-_vpaes_schedule_transform
952 ## .aes_schedule_mangle
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
##     xor with 0x63
##     multiply by circulant 0,1,1,1
##     apply shiftrows transform
##
## On decrypt,
##     multiply by "inverse mixcolumns" circulant E,B,D,9
##     deskew, xor with 0x63
##     apply shiftrows transform
969 ## Writes out to (%rdx), and increments or decrements it
970 ## Keeps track of round number mod 4 in %r8
972 ## Clobbers xmm1-xmm5
974 .type _vpaes_schedule_mangle,%function
976 _vpaes_schedule_mangle:
977 mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
978 // vmovdqa .Lk_mc_forward(%rip),%xmm5
979 cbnz $dir, .Lschedule_mangle_dec
982 eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4
983 add $out, $out, #16 // add \$16, %rdx
984 tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
985 tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
986 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
987 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
988 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
989 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
991 b .Lschedule_mangle_both
993 .Lschedule_mangle_dec:
994 // inverse mix columns
995 // lea .Lk_dksd(%rip),%r11
996 ushr v1.16b, v4.16b, #4 // vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
997 and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo
999 // vmovdqa 0x00(%r11), %xmm2
1000 tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
1001 // vmovdqa 0x10(%r11), %xmm3
1002 tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
1003 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
1004 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
1006 // vmovdqa 0x20(%r11), %xmm2
1007 tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
1008 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
1009 // vmovdqa 0x30(%r11), %xmm3
1010 tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
1011 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
1012 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
1014 // vmovdqa 0x40(%r11), %xmm2
1015 tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
1016 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
1017 // vmovdqa 0x50(%r11), %xmm3
1018 tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
1019 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
1021 // vmovdqa 0x60(%r11), %xmm2
1022 tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
1023 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
1024 // vmovdqa 0x70(%r11), %xmm4
1025 tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
1026 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
1027 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
1028 eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3
1030 sub $out, $out, #16 // add \$-16, %rdx
1032 .Lschedule_mangle_both:
1033 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
1034 add x8, x8, #64-16 // add \$-16, %r8
1035 and x8, x8, #~(1<<6) // and \$0x30, %r8
1036 st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx)
1038 .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
1040 .globl vpaes_set_encrypt_key
1041 .type vpaes_set_encrypt_key,%function
1043 vpaes_set_encrypt_key:
1044 stp x29,x30,[sp,#-16]!
1046 stp d8,d9,[sp,#-16]! // ABI spec says so
1048 lsr w9, $bits, #5 // shr \$5,%eax
1049 add w9, w9, #5 // \$5,%eax
1050 str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
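// e.g. 128 >> 5 = 4, plus 5 gives 9; _vpaes_encrypt_core then runs that many
// "middle" rounds plus one final round, i.e. the usual 10 rounds of AES-128
// (similarly 11/13 stored for 192/256-bit keys, giving 12/14 rounds).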
1052 mov $dir, #0 // mov \$0,%ecx
1053 mov x8, #0x30 // mov \$0x30,%r8d
1054 bl _vpaes_schedule_core
1058 ldp x29,x30,[sp],#16
1060 .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
1062 .globl vpaes_set_decrypt_key
1063 .type vpaes_set_decrypt_key,%function
1065 vpaes_set_decrypt_key:
1066 stp x29,x30,[sp,#-16]!
1068 stp d8,d9,[sp,#-16]! // ABI spec says so
1070 lsr w9, $bits, #5 // shr \$5,%eax
1071 add w9, w9, #5 // \$5,%eax
1072 str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
1073 lsl w9, w9, #4 // shl \$4,%eax
1074 add $out, $out, #16 // lea 16(%rdx,%rax),%rdx
1077 mov $dir, #1 // mov \$1,%ecx
1078 lsr w8, $bits, #1 // shr \$1,%r8d
1079 and x8, x8, #32 // and \$32,%r8d
1080 eor x8, x8, #32 // xor \$32,%r8d # nbits==192?0:32
1081 bl _vpaes_schedule_core
1084 ldp x29,x30,[sp],#16
1086 .size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
1090 my ($inp,$out,$len,$key,$ivec,$dir) = map("x$_",(0..5));
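# i.e. the AArch64 ABI view is (in=x0, out=x1, length=x2, key=x3, ivec=x4,
# enc=x5), presumably mirroring the AES_cbc_encrypt argument order.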
1093 .globl vpaes_cbc_encrypt
1094 .type vpaes_cbc_encrypt,%function
1097 cbz $len, .Lcbc_abort
1098 cmp w5, #0 // check direction
1099 b.eq vpaes_cbc_decrypt
1101 stp x29,x30,[sp,#-16]!
1104 mov x17, $len // reassign
1105 mov x2, $key // reassign
1107 ld1 {v0.16b}, [$ivec] // load ivec
1108 bl _vpaes_encrypt_preheat
1113 ld1 {v7.16b}, [$inp],#16 // load input
1114 eor v7.16b, v7.16b, v0.16b // xor with ivec
1115 bl _vpaes_encrypt_core
1116 st1 {v0.16b}, [$out],#16 // save output
1120 st1 {v0.16b}, [$ivec] // write ivec
1122 ldp x29,x30,[sp],#16
1125 .size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
1127 .type vpaes_cbc_decrypt,%function
1130 stp x29,x30,[sp,#-16]!
1132 stp d8,d9,[sp,#-16]! // ABI spec says so
1133 stp d10,d11,[sp,#-16]!
1134 stp d12,d13,[sp,#-16]!
1135 stp d14,d15,[sp,#-16]!
1137 mov x17, $len // reassign
1138 mov x2, $key // reassign
1139 ld1 {v6.16b}, [$ivec] // load ivec
1140 bl _vpaes_decrypt_preheat
1142 b.eq .Lcbc_dec_loop2x
1144 ld1 {v7.16b}, [$inp], #16 // load input
1145 bl _vpaes_decrypt_core
1146 eor v0.16b, v0.16b, v6.16b // xor with ivec
1147 orr v6.16b, v7.16b, v7.16b // next ivec value
1148 st1 {v0.16b}, [$out], #16
1154 ld1 {v14.16b,v15.16b}, [$inp], #32
1155 bl _vpaes_decrypt_2x
1156 eor v0.16b, v0.16b, v6.16b // xor with ivec
1157 eor v1.16b, v1.16b, v14.16b
1158 orr v6.16b, v15.16b, v15.16b
1159 st1 {v0.16b,v1.16b}, [$out], #32
1161 b.hi .Lcbc_dec_loop2x
1164 st1 {v6.16b}, [$ivec]
1166 ldp d14,d15,[sp],#16
1167 ldp d12,d13,[sp],#16
1168 ldp d10,d11,[sp],#16
1170 ldp x29,x30,[sp],#16
1172 .size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
1176 .globl vpaes_ecb_encrypt
1177 .type vpaes_ecb_encrypt,%function
1180 stp x29,x30,[sp,#-16]!
1182 stp d8,d9,[sp,#-16]! // ABI spec says so
1183 stp d10,d11,[sp,#-16]!
1184 stp d12,d13,[sp,#-16]!
1185 stp d14,d15,[sp,#-16]!
1189 bl _vpaes_encrypt_preheat
1193 ld1 {v7.16b}, [$inp],#16
1194 bl _vpaes_encrypt_core
1195 st1 {v0.16b}, [$out],#16
1201 ld1 {v14.16b,v15.16b}, [$inp], #32
1202 bl _vpaes_encrypt_2x
1203 st1 {v0.16b,v1.16b}, [$out], #32
1208 ldp d14,d15,[sp],#16
1209 ldp d12,d13,[sp],#16
1210 ldp d10,d11,[sp],#16
1212 ldp x29,x30,[sp],#16
1214 .size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
1216 .globl vpaes_ecb_decrypt
1217 .type vpaes_ecb_decrypt,%function
1220 stp x29,x30,[sp,#-16]!
1222 stp d8,d9,[sp,#-16]! // ABI spec says so
1223 stp d10,d11,[sp,#-16]!
1224 stp d12,d13,[sp,#-16]!
1225 stp d14,d15,[sp,#-16]!
1229 bl _vpaes_decrypt_preheat
1233 ld1 {v7.16b}, [$inp],#16
bl _vpaes_decrypt_core
1235 st1 {v0.16b}, [$out],#16
1241 ld1 {v14.16b,v15.16b}, [$inp], #32
1242 bl _vpaes_decrypt_2x
1243 st1 {v0.16b,v1.16b}, [$out], #32
1248 ldp d14,d15,[sp],#16
1249 ldp d12,d13,[sp],#16
1250 ldp d10,d11,[sp],#16
1252 ldp x29,x30,[sp],#16
1254 .size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt