# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
######################################################################
## Constant-time SSSE3 AES core implementation.
##
## By Mike Hamburg (Stanford University), 2009
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
######################################################################
######################################################################
# ARMv8 NEON adaptation by <appro@openssl.org>
#
# The reason for undertaking this effort is that there is at least one
# popular SoC based on Cortex-A53 that doesn't have crypto extensions.
# Performance below is given in cycles per byte processed:
#
#		CBC enc		ECB enc/dec(*)	[bit-sliced enc/dec]
# Cortex-A53	21.5		18.1/20.6	[17.5/19.8	]
# Cortex-A57	36.0(**)	20.4/24.9(**)	[14.4/16.6	]
# X-Gene	45.9(**)	45.8/57.7(**)	[33.1/37.6(**)	]
# Denver(***)	16.6(**)	15.1/17.8(**)	[8.80/9.93	]
# Apple A7(***)	22.7(**)	10.9/14.3	[8.45/10.0	]
# Mongoose(***)	26.3(**)	21.0/25.0(**)	[13.3/16.8	]
# ThunderX2(***) 39.4(**)	33.8/48.6(**)
#
# (*)	ECB denotes approximate result for parallelizable modes
#	such as CBC decrypt, CTR, etc.;
# (**)	these results are worse than scalar compiler-generated
#	code, but this code is constant-time and therefore preferred;
# (***)	presented for reference/comparison purposes;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
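# A typical invocation looks like this (illustrative only; flavour names
# such as "linux64" are defined by the perlasm arm-xlate driver):
#
#   perl vpaes-armv8.pl linux64 vpaes-armv8.S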
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
.type	_vpaes_consts,%object
.align	7	// totally strategic alignment
_vpaes_consts:
.Lk_mc_forward:	// mc_forward
	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
	.quad	0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward: // mc_backward
	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
	.quad	0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr:		// sr
	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
	.quad	0x0F060D040B020900, 0x070E050C030A0108
	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
.Lk_inv:	// inv, inva
	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
	.quad	0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt:	// input transform (lo, hi)
	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:	// sbou, sbot
	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1:	// sb1u, sb1t
	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:	// sb2u, sb2t
	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
//
//  Decryption stuff
//
.Lk_dipt:	// decryption input transform
	.quad	0x0F505B040B545F00, 0x154A411E114E451A
	.quad	0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo:	// decryption sbox final output
	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9:	// decryption sbox output *9*u, *9*t
	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	// decryption sbox output *D*u, *D*t
	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	// decryption sbox output *B*u, *B*t
	.quad	0xD022649296B44200, 0x602646F6B0F2D404
	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	// decryption sbox output *E*u, *E*t
	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
//
//  Key schedule constants
//
.Lk_dksd:	// decryption key schedule: invskew x*D
	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	// decryption key schedule: invskew x*B
	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	// decryption key schedule: invskew x*E + 0x63
	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	// decryption key schedule: invskew x*9
	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
.Lk_rcon:	// rcon
	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
.Lk_opt:	// output transform
	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:	// deskew tables: inverts the sbox's "skew"
	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.asciz	"Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)"
.size	_vpaes_consts,.-_vpaes_consts
my ($inp,$out,$key) = map("x$_",(0..2));

my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23));
my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet) = map("v$_.16b",(24..31));
//
//  _aes_preheat
//
//  Fills register %r10 -> .aes_consts (so you can -fPIC)
//  and %xmm9-%xmm15 as specified below.
//
.type	_vpaes_encrypt_preheat,%function
.align	4
_vpaes_encrypt_preheat:
	adr	x10, .Lk_inv
	movi	v17.16b, #0x0f
	ld1	{v18.2d-v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d-v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
	ld1	{v24.2d-v27.2d}, [x10]		// .Lk_sb1, .Lk_sb2
	ret
.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
//
//  _aes_encrypt_core
//
//  AES-encrypt %xmm0.
//
//  Inputs:
//     %xmm0 = input
//     %xmm9-%xmm15 as in _vpaes_preheat
//    (%rdx) = scheduled keys
//
//  Output in %xmm0
//  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
//  Preserves %xmm6 - %xmm8 so you get some local vectors
//
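//
// Register mapping note for this adaptation (the x86 names above are
// kept from the original SSSE3 comments): the input block arrives in
// v7, the scheduled-key pointer in $key (x2, copied to x9), and the
// result is returned in v0; v17-v27 hold the preheated constants and
// v16 the current round key.
//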
.type	_vpaes_encrypt_core,%function
.align	4
_vpaes_encrypt_core:
	mov	x9, $key
	ldr	w8, [$key,#240]			// pull rounds
	adr	x11, .Lk_mc_forward+16
						// vmovdqa	.Lk_ipt(%rip), %xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9), %xmm5		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9, %xmm0, %xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb	\$4, %xmm0, %xmm0
	tbl	v1.16b, {$iptlo}, v1.16b	// vpshufb	%xmm1, %xmm2, %xmm1
						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {$ipthi}, v0.16b	// vpshufb	%xmm0, %xmm3, %xmm2
	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5, %xmm1, %xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2, %xmm0, %xmm0
	b	.Lenc_entry
.align	4
.Lenc_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {$sb1t}, v2.16b		// vpshufb	%xmm2, %xmm13, %xmm4	# 4 = sb1u
	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b, {$sb1u}, v3.16b		// vpshufb	%xmm3, %xmm12, %xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	tbl	v5.16b, {$sb2t}, v2.16b		// vpshufb	%xmm2, %xmm15, %xmm5	# 4 = sb2u
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 0 = A
	tbl	v2.16b, {$sb2u}, v3.16b		// vpshufb	%xmm3, %xmm14, %xmm2	# 2 = sb2t
	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1, %xmm0, %xmm3	# 0 = B
	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5, %xmm2, %xmm2	# 2 = 2A
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4, %xmm0, %xmm0	# 3 = D
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2, %xmm3, %xmm3	# 0 = 2A+B
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1, %xmm3, %xmm4	# 0 = 2B+C
	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3, %xmm0, %xmm0	# 3 = 2A+B+D
	and	x11, x11, #~(1<<6)		// and	\$0x30, %r11		# ... mod 4
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 0 = 2A+3B+C+D
	sub	w8, w8, #1			// nr--
.Lenc_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0, %xmm9, %xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4, %xmm0, %xmm0	# 1 = i
	tbl	v5.16b, {$invhi}, v1.16b	// vpshufb	%xmm1, %xmm11, %xmm5	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0, %xmm1, %xmm1	# 0 = j
	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb	%xmm0, %xmm10, %xmm3	# 3 = 1/i
	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb	%xmm1, %xmm10, %xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {$invlo}, v3.16b	// vpshufb	%xmm3, %xmm10, %xmm2	# 2 = 1/iak
	tbl	v3.16b, {$invlo}, v4.16b	// vpshufb	%xmm4, %xmm10, %xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1, %xmm2, %xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0, %xmm3, %xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9), %xmm5
	cbnz	w8, .Lenc_loop
	// middle of last round
	add	x10, x11, #0x80
						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {$sbou}, v2.16b		// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbou
	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b, {$sbot}, v3.16b		// vpshufb	%xmm3, %xmm0, %xmm0	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1, %xmm0, %xmm0
	ret
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
.globl	vpaes_encrypt
.type	vpaes_encrypt,%function
.align	4
vpaes_encrypt:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [$inp]
	bl	_vpaes_encrypt_preheat
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [$out]

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_encrypt,.-vpaes_encrypt
.type	_vpaes_encrypt_2x,%function
.align	4
_vpaes_encrypt_2x:
	mov	x9, $key
	ldr	w8, [$key,#240]			// pull rounds
	adr	x11, .Lk_mc_forward+16
						// vmovdqa	.Lk_ipt(%rip), %xmm2	# iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9), %xmm5		# round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand	%xmm9, %xmm0, %xmm1
	ushr	v0.16b, v14.16b, #4		// vpsrlb	\$4, %xmm0, %xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v1.16b, {$iptlo}, v1.16b	// vpshufb	%xmm1, %xmm2, %xmm1
	tbl	v9.16b, {$iptlo}, v9.16b
						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
	tbl	v2.16b, {$ipthi}, v0.16b	// vpshufb	%xmm0, %xmm3, %xmm2
	tbl	v10.16b, {$ipthi}, v8.16b
	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5, %xmm1, %xmm0
	eor	v8.16b, v9.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2, %xmm0, %xmm0
	eor	v8.16b, v8.16b, v10.16b
	b	.Lenc_2x_entry
.align	4
.Lenc_2x_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {$sb1t}, v2.16b		// vpshufb	%xmm2, %xmm13, %xmm4	# 4 = sb1u
	tbl	v12.16b, {$sb1t}, v10.16b
	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	tbl	v0.16b, {$sb1u}, v3.16b		// vpshufb	%xmm3, %xmm12, %xmm0	# 0 = sb1t
	tbl	v8.16b, {$sb1u}, v11.16b
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	tbl	v5.16b, {$sb2t}, v2.16b		// vpshufb	%xmm2, %xmm15, %xmm5	# 4 = sb2u
	tbl	v13.16b, {$sb2t}, v10.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v2.16b, {$sb2u}, v3.16b		// vpshufb	%xmm3, %xmm14, %xmm2	# 2 = sb2t
	tbl	v10.16b, {$sb2u}, v11.16b
	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1, %xmm0, %xmm3	# 0 = B
	tbl	v11.16b, {v8.16b}, v1.16b
	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5, %xmm2, %xmm2	# 2 = 2A
	eor	v10.16b, v10.16b, v13.16b
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4, %xmm0, %xmm0	# 3 = D
	tbl	v8.16b, {v8.16b}, v4.16b
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2, %xmm3, %xmm3	# 0 = 2A+B
	eor	v11.16b, v11.16b, v10.16b
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1, %xmm3, %xmm4	# 0 = 2B+C
	tbl	v12.16b, {v11.16b},v1.16b
	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3, %xmm0, %xmm0	# 3 = 2A+B+D
	eor	v8.16b, v8.16b, v11.16b
	and	x11, x11, #~(1<<6)		// and	\$0x30, %r11		# ... mod 4
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 0 = 2A+3B+C+D
	eor	v8.16b, v8.16b, v12.16b
	sub	w8, w8, #1			// nr--
.Lenc_2x_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0, %xmm9, %xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4, %xmm0, %xmm0	# 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v5.16b, {$invhi},v1.16b		// vpshufb	%xmm1, %xmm11, %xmm5	# 2 = a/k
	tbl	v13.16b, {$invhi},v9.16b
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0, %xmm1, %xmm1	# 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {$invlo},v0.16b		// vpshufb	%xmm0, %xmm10, %xmm3	# 3 = 1/i
	tbl	v11.16b, {$invlo},v8.16b
	tbl	v4.16b, {$invlo},v1.16b		// vpshufb	%xmm1, %xmm10, %xmm4	# 4 = 1/j
	tbl	v12.16b, {$invlo},v9.16b
	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v13.16b
	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v13.16b
	tbl	v2.16b, {$invlo},v3.16b		// vpshufb	%xmm3, %xmm10, %xmm2	# 2 = 1/iak
	tbl	v10.16b, {$invlo},v11.16b
	tbl	v3.16b, {$invlo},v4.16b		// vpshufb	%xmm4, %xmm10, %xmm3	# 3 = 1/jak
	tbl	v11.16b, {$invlo},v12.16b
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1, %xmm2, %xmm2	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0, %xmm3, %xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9), %xmm5
	cbnz	w8, .Lenc_2x_loop
	// middle of last round
	add	x10, x11, #0x80
						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	tbl	v4.16b, {$sbou}, v2.16b		// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbou
	tbl	v12.16b, {$sbou}, v10.16b
	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	tbl	v0.16b, {$sbot}, v3.16b		// vpshufb	%xmm3, %xmm0, %xmm0	# 0 = sb1t
	tbl	v8.16b, {$sbot}, v11.16b
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v0.16b, {v0.16b},v1.16b		// vpshufb	%xmm1, %xmm0, %xmm0
	tbl	v1.16b, {v8.16b},v1.16b
	ret
.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x
.type	_vpaes_decrypt_preheat,%function
.align	4
_vpaes_decrypt_preheat:
	adr	x10, .Lk_inv
	movi	v17.16b, #0x0f
	adr	x11, .Lk_dipt
	ld1	{v18.2d-v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d-v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
	ld1	{v24.2d-v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
	ld1	{v28.2d-v31.2d}, [x11]		// .Lk_dsbb, .Lk_dsbe
	ret
.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
//
//  Decryption core
//
//  Same API as encryption core.
//
.type	_vpaes_decrypt_core,%function
.align	4
_vpaes_decrypt_core:
	mov	x9, $key
	ldr	w8, [$key,#240]			// pull rounds

						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov	%rax, %r11; shl	\$4, %r11
	eor	x11, x11, #0x30			// xor	\$0x30, %r11
	adr	x10, .Lk_sr
	and	x11, x11, #0x30			// and	\$0x30, %r11
	add	x11, x11, x10
	adr	x10, .Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9), %xmm4		# round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9, %xmm0, %xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb	\$4, %xmm0, %xmm0
	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb	%xmm1, %xmm2, %xmm2
	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb	%xmm0, %xmm1, %xmm0
	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4, %xmm2, %xmm2
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2, %xmm0, %xmm0
	b	.Ldec_entry
.align	4
.Ldec_loop:
//
//  Inverse mix columns
//
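// InvMixColumns multiplies each state column by the circulant
// {0E,0B,0D,09}; here that product is assembled from four nibble-driven
// table lookups (.Lk_dsb9/.Lk_dsbd/.Lk_dsbb/.Lk_dsbe, i.e. x*9, x*D,
// x*B, x*E), with the accumulator rotated between steps by the
// "MC ch" shuffles through v5.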
						// vmovdqa	-0x20(%r10),%xmm4	# 4 : sb9u
						// vmovdqa	-0x10(%r10),%xmm1	# 0 : sb9t
	tbl	v4.16b, {$sb9u}, v2.16b		// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sb9u
	tbl	v1.16b, {$sb9t}, v3.16b		// vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sb9t
	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4, %xmm0, %xmm0
						// vmovdqa	0x00(%r10),%xmm4	# 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
						// vmovdqa	0x10(%r10),%xmm1	# 0 : sbdt

	tbl	v4.16b, {$sbdu}, v2.16b		// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbdu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5, %xmm0, %xmm0	# MC ch
	tbl	v1.16b, {$sbdt}, v3.16b		// vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sbdt
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 4 = ch
						// vmovdqa	0x20(%r10), %xmm4	# 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
						// vmovdqa	0x30(%r10), %xmm1	# 0 : sbbt

	tbl	v4.16b, {$sbbu}, v2.16b		// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbbu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5, %xmm0, %xmm0	# MC ch
	tbl	v1.16b, {$sbbt}, v3.16b		// vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sbbt
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 4 = ch
						// vmovdqa	0x40(%r10), %xmm4	# 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
						// vmovdqa	0x50(%r10), %xmm1	# 0 : sbet

	tbl	v4.16b, {$sbeu}, v2.16b		// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbeu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5, %xmm0, %xmm0	# MC ch
	tbl	v1.16b, {$sbet}, v3.16b		// vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sbet
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 4 = ch
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr	\$12, %xmm5, %xmm5, %xmm5
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
	sub	w8, w8, #1			// sub	\$1,%rax		# nr--
.Ldec_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9, %xmm0, %xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4, %xmm0, %xmm0	# 1 = i
	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb	%xmm1, %xmm11, %xmm2	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0, %xmm1, %xmm1	# 0 = j
	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb	%xmm0, %xmm10, %xmm3	# 3 = 1/i
	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb	%xmm1, %xmm10, %xmm4	# 4 = 1/j
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {$invlo}, v3.16b	// vpshufb	%xmm3, %xmm10, %xmm2	# 2 = 1/iak
	tbl	v3.16b, {$invlo}, v4.16b	// vpshufb	%xmm4, %xmm10, %xmm3	# 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1, %xmm2, %xmm2	# 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0, %xmm3, %xmm3	# 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9), %xmm0
	cbnz	w8, .Ldec_loop
	// middle of last round
						// vmovdqa	0x60(%r10), %xmm4	# 3 : sbou
	tbl	v4.16b, {$sbou}, v2.16b		// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbou
						// vmovdqa	0x70(%r10), %xmm1	# 0 : sbot
	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	tbl	v1.16b, {$sbot}, v3.16b		// vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0, %xmm4, %xmm4	# 4 = sb1u + k
	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4, %xmm1, %xmm0	# 0 = A
	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2, %xmm0, %xmm0
	ret
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
.globl	vpaes_decrypt
.type	vpaes_decrypt,%function
.align	4
vpaes_decrypt:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [$inp]
	bl	_vpaes_decrypt_preheat
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [$out]

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_decrypt,.-vpaes_decrypt
// v14-v15 input, v0-v1 output
.type	_vpaes_decrypt_2x,%function
.align	4
_vpaes_decrypt_2x:
	mov	x9, $key
	ldr	w8, [$key,#240]			// pull rounds

						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
	lsl	x11, x8, #4			// mov	%rax, %r11; shl	\$4, %r11
	eor	x11, x11, #0x30			// xor	\$0x30, %r11
	adr	x10, .Lk_sr
	and	x11, x11, #0x30			// and	\$0x30, %r11
	add	x11, x11, x10
	adr	x10, .Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9), %xmm4		# round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand	%xmm9, %xmm0, %xmm1
	ushr	v0.16b, v14.16b, #4		// vpsrlb	\$4, %xmm0, %xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v2.16b, {$iptlo},v1.16b		// vpshufb	%xmm1, %xmm2, %xmm2
	tbl	v10.16b, {$iptlo},v9.16b
	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
	tbl	v0.16b, {$ipthi},v0.16b		// vpshufb	%xmm0, %xmm1, %xmm0
	tbl	v8.16b, {$ipthi},v8.16b
	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4, %xmm2, %xmm2
	eor	v10.16b, v10.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2, %xmm0, %xmm0
	eor	v8.16b, v8.16b, v10.16b
	b	.Ldec_2x_entry
.align	4
.Ldec_2x_loop:
//
//  Inverse mix columns
//
						// vmovdqa	-0x20(%r10),%xmm4	# 4 : sb9u
						// vmovdqa	-0x10(%r10),%xmm1	# 0 : sb9t
	tbl	v4.16b, {$sb9u}, v2.16b		// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sb9u
	tbl	v12.16b, {$sb9u}, v10.16b
	tbl	v1.16b, {$sb9t}, v3.16b		// vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sb9t
	tbl	v9.16b, {$sb9t}, v11.16b
	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4, %xmm0, %xmm0
	eor	v8.16b, v12.16b, v16.16b
						// vmovdqa	0x00(%r10),%xmm4	# 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
	eor	v8.16b, v8.16b, v9.16b		// vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
						// vmovdqa	0x10(%r10),%xmm1	# 0 : sbdt

	tbl	v4.16b, {$sbdu}, v2.16b		// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbdu
	tbl	v12.16b, {$sbdu}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b		// vpshufb	%xmm5, %xmm0, %xmm0	# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {$sbdt}, v3.16b		// vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sbdt
	tbl	v9.16b, {$sbdt}, v11.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
						// vmovdqa	0x20(%r10), %xmm4	# 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
						// vmovdqa	0x30(%r10), %xmm1	# 0 : sbbt

	tbl	v4.16b, {$sbbu}, v2.16b		// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbbu
	tbl	v12.16b, {$sbbu}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b		// vpshufb	%xmm5, %xmm0, %xmm0	# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {$sbbt}, v3.16b		// vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sbbt
	tbl	v9.16b, {$sbbt}, v11.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
						// vmovdqa	0x40(%r10), %xmm4	# 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
						// vmovdqa	0x50(%r10), %xmm1	# 0 : sbet

	tbl	v4.16b, {$sbeu}, v2.16b		// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbeu
	tbl	v12.16b, {$sbeu}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b		// vpshufb	%xmm5, %xmm0, %xmm0	# MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {$sbet}, v3.16b		// vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sbet
	tbl	v9.16b, {$sbet}, v11.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4, %xmm0, %xmm0	# 4 = ch
	eor	v8.16b, v8.16b, v12.16b
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr	\$12, %xmm5, %xmm5, %xmm5
	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
	eor	v8.16b, v8.16b, v9.16b
	sub	w8, w8, #1			// sub	\$1,%rax		# nr--
.Ldec_2x_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9, %xmm0, %xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4, %xmm0, %xmm0	# 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v2.16b, {$invhi},v1.16b		// vpshufb	%xmm1, %xmm11, %xmm2	# 2 = a/k
	tbl	v10.16b, {$invhi},v9.16b
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0, %xmm1, %xmm1	# 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {$invlo},v0.16b		// vpshufb	%xmm0, %xmm10, %xmm3	# 3 = 1/i
	tbl	v11.16b, {$invlo},v8.16b
	tbl	v4.16b, {$invlo},v1.16b		// vpshufb	%xmm1, %xmm10, %xmm4	# 4 = 1/j
	tbl	v12.16b, {$invlo},v9.16b
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v10.16b
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v10.16b
	tbl	v2.16b, {$invlo},v3.16b		// vpshufb	%xmm3, %xmm10, %xmm2	# 2 = 1/iak
	tbl	v10.16b, {$invlo},v11.16b
	tbl	v3.16b, {$invlo},v4.16b		// vpshufb	%xmm4, %xmm10, %xmm3	# 3 = 1/jak
	tbl	v11.16b, {$invlo},v12.16b
	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1, %xmm2, %xmm2	# 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0, %xmm3, %xmm3	# 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9), %xmm0
	cbnz	w8, .Ldec_2x_loop
	// middle of last round
						// vmovdqa	0x60(%r10), %xmm4	# 3 : sbou
	tbl	v4.16b, {$sbou}, v2.16b		// vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbou
	tbl	v12.16b, {$sbou}, v10.16b
						// vmovdqa	0x70(%r10), %xmm1	# 0 : sbot
	tbl	v1.16b, {$sbot}, v3.16b		// vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sb1t
	tbl	v9.16b, {$sbot}, v11.16b
	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0, %xmm4, %xmm4	# 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4, %xmm1, %xmm0	# 0 = A
	eor	v8.16b, v9.16b, v12.16b
	tbl	v0.16b, {v0.16b},v2.16b		// vpshufb	%xmm2, %xmm0, %xmm0
	tbl	v1.16b, {v8.16b},v2.16b
	ret
.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
my ($inp,$bits,$out,$dir) = ("x0","w1","x2","w3");
my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));
////////////////////////////////////////////////////////
//                                                    //
//                  AES key schedule                  //
//                                                    //
////////////////////////////////////////////////////////
.type	_vpaes_key_preheat,%function
.align	4
_vpaes_key_preheat:
	adr	x10, .Lk_inv
	movi	v16.16b, #0x5b			// .Lk_s63
	adr	x11, .Lk_sb1
	movi	v17.16b, #0x0f			// .Lk_s0F
	ld1	{v18.2d-v21.2d}, [x10]		// .Lk_inv, .Lk_ipt
	adr	x10, .Lk_dksd
	ld1	{v22.2d-v23.2d}, [x11]		// .Lk_sb1
	adr	x11, .Lk_mc_forward
	ld1	{v24.2d-v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
	ld1	{v28.2d-v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
	ld1	{v8.2d}, [x10]			// .Lk_rcon
	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
	ret
.size	_vpaes_key_preheat,.-_vpaes_key_preheat
.type	_vpaes_schedule_core,%function
.align	4
_vpaes_schedule_core:
	.inst	0xd503233f		// paciasp
	stp	x29, x30, [sp,#-16]!
	add	x29, sp, #0

	bl	_vpaes_key_preheat	// load the tables

	ld1	{v0.16b}, [$inp],#16	// vmovdqu	(%rdi), %xmm0		# load key (unaligned)

	// input transform
	mov	v3.16b, v0.16b		// vmovdqa	%xmm0, %xmm3
	bl	_vpaes_schedule_transform
	mov	v7.16b, v0.16b		// vmovdqa	%xmm0, %xmm7

	adr	x10, .Lk_sr		// lea	.Lk_sr(%rip),%r10
	add	x8, x8, x10
	cbnz	$dir, .Lschedule_am_decrypting
	// encrypting, output zeroth round key after transform
	st1	{v0.2d}, [$out]		// vmovdqu	%xmm0, (%rdx)
	b	.Lschedule_go
.Lschedule_am_decrypting:
	// decrypting, output zeroth round key after shiftrows
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10), %xmm1
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1, %xmm3, %xmm3
	st1	{v3.2d}, [$out]			// vmovdqu	%xmm3, (%rdx)
	eor	x8, x8, #0x30			// xor	\$0x30, %r8

.Lschedule_go:
	cmp	$bits, #192			// cmp	\$192, %esi
	b.hi	.Lschedule_256
	b.eq	.Lschedule_192
	// 128: fall though
//
//  .schedule_128
//
//  128-bit specific part of key schedule.
//
//  This schedule is really simple, because all its parts
//  are accomplished by the subroutines.
//
.Lschedule_128:
	mov	$inp, #10		// mov	\$10, %esi

.Loop_schedule_128:
	sub	$inp, $inp, #1		// dec	%esi
	bl	_vpaes_schedule_round
	cbz	$inp, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle	// write output
	b	.Loop_schedule_128
//
//  .schedule_192
//
//  192-bit specific part of key schedule.
//
//  The main body of this schedule is the same as the 128-bit
//  schedule, but with more smearing.  The long, high side is
//  stored in %xmm7 as before, and the short, low side is in
//  the high bits of %xmm6.
//
//  This schedule is somewhat nastier, however, because each
//  round produces 192 bits of key material, or 1.5 round keys.
//  Therefore, on each cycle we do 2 rounds and produce 3 round
//  keys.
//
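// (Arithmetic check: the round-0 key is stored before .Lschedule_go,
// and the loop below runs 4 cycles of 3 keys each, so 1 + 4*3 = 13
// round keys, exactly what AES-192's 12 rounds require.)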
.align	4
.Lschedule_192:
	sub	$inp, $inp, #8
	ld1	{v0.16b}, [$inp]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	v6.16b, v0.16b			// vmovdqa	%xmm0, %xmm6		# save short part
	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4, %xmm4, %xmm4	# clear 4
	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4, %xmm6, %xmm6	# clobber low side with zeros
	mov	$inp, #4			// mov	\$4, %esi

.Loop_schedule_192:
	sub	$inp, $inp, #1			// dec	%esi
	bl	_vpaes_schedule_round
	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	\$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle		// save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle		// save key n+1
	bl	_vpaes_schedule_round
	cbz	$inp, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle		// save key n+2
	bl	_vpaes_schedule_192_smear
	b	.Loop_schedule_192
//
//  .schedule_256
//
//  256-bit specific part of key schedule.
//
//  The structure here is very similar to the 128-bit
//  schedule, but with an additional "low side" in
//  %xmm6.  The low side's rounds are the same as the
//  high side's, except no rcon and no rotation.
//
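// (Arithmetic check: after the round-0 key, 7 cycles alternating one
// high and one low round emit the remaining 14 round keys, 15 in
// total for AES-256's 14 rounds.)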
.align	4
.Lschedule_256:
	ld1	{v0.16b}, [$inp]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	$inp, #7			// mov	\$7, %esi

.Loop_schedule_256:
	sub	$inp, $inp, #1			// dec	%esi
	bl	_vpaes_schedule_mangle		// output low result
	mov	v6.16b, v0.16b			// vmovdqa	%xmm0, %xmm6		# save cur_lo in xmm6

	// high round
	bl	_vpaes_schedule_round
	cbz	$inp, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle

	// low round. swap xmm7 and xmm6
	dup	v0.4s, v0.s[3]			// vpshufd	\$0xFF, %xmm0, %xmm0
	movi	v4.16b, #0
	mov	v5.16b, v7.16b			// vmovdqa	%xmm7, %xmm5
	mov	v7.16b, v6.16b			// vmovdqa	%xmm6, %xmm7
	bl	_vpaes_schedule_low_round
	mov	v7.16b, v5.16b			// vmovdqa	%xmm5, %xmm7

	b	.Loop_schedule_256
//
//  .aes_schedule_mangle_last
//
//  Mangler for last round of key schedule
//  Mangles %xmm0
//	when encrypting, outputs out(%xmm0) ^ 63
//	when decrypting, outputs unskew(%xmm0)
//
//  Always called right before return... jumps to cleanup and exits
//
.align	4
.Lschedule_mangle_last:
	// schedule last round key from xmm0
	adr	x11, .Lk_deskew			// lea	.Lk_deskew(%rip),%r11	# prepare to deskew
	cbnz	$dir, .Lschedule_mangle_last_dec

	// encrypting
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
	adr	x11, .Lk_opt			// lea	.Lk_opt(%rip), %r11	# prepare to output transform
	add	$out, $out, #32			// add	\$32, %rdx
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1, %xmm0, %xmm0	# output permute
.Lschedule_mangle_last_dec:
	ld1	{v20.2d-v21.2d}, [x11]		// reload constants
	sub	$out, $out, #16			// add	\$-16, %rdx
	eor	v0.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip), %xmm0, %xmm0
	bl	_vpaes_schedule_transform	// output transform
	st1	{v0.2d}, [$out]			// vmovdqu	%xmm0, (%rdx)		# save last key

	// cleanup
	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0, %xmm0, %xmm0
	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1, %xmm1, %xmm1
	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2, %xmm2, %xmm2
	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3, %xmm3, %xmm3
	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4, %xmm4, %xmm4
	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5, %xmm5, %xmm5
	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6, %xmm6, %xmm6
	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7, %xmm7, %xmm7
	ldp	x29, x30, [sp],#16
	.inst	0xd50323bf			// autiasp
	ret
.size	_vpaes_schedule_core,.-_vpaes_schedule_core
//
//  .aes_schedule_192_smear
//
//  Smear the short, low side in the 192-bit key schedule.
//
//  Inputs:
//	%xmm7: high side, b a x y
//	%xmm6: low side, d c 0 0
//
//  Outputs:
//	%xmm6: b+c+d b+c 0 0
//	%xmm0: b+c+d b+c b a
//
.type	_vpaes_schedule_192_smear,%function
.align	4
_vpaes_schedule_192_smear:
	movi	v1.16b, #0
	dup	v0.4s, v7.s[3]
	ins	v1.s[3], v6.s[2]	// vpshufd	\$0x80, %xmm6, %xmm1	# d c 0 0 -> c 0 0 0
	ins	v0.s[0], v7.s[2]	// vpshufd	\$0xFE, %xmm7, %xmm0	# b a _ _ -> b b b a
	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1, %xmm6, %xmm6	# -> c+d c 0 0
	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1, %xmm1, %xmm1
	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0, %xmm6, %xmm6	# -> b+c+d b+c b a
	mov	v0.16b, v6.16b		// vmovdqa	%xmm6, %xmm0
	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1, %xmm6, %xmm6	# clobber low side with zeros
	ret
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
//
//  .aes_schedule_round
//
//  Runs one main round of the key schedule on %xmm0, %xmm7
//
//  Specifically, runs subbytes on the high dword of %xmm0
//  then rotates it by one byte and xors into the low dword of
//  %xmm7.
//
//  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
//  the next rcon.
//
//  Smears the dwords of %xmm7 by xoring the low into the
//  second low, result into third, result into highest.
//
//  Returns results in %xmm7 = %xmm0.
//  Clobbers %xmm1-%xmm4, %r11.
//
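//
// Informal sketch of the net effect (not original text):
//	xmm7 ^= rcon			(xmm8 rotated for the next round)
//	t     = subbytes(rotword(high dword of xmm0))
//	xmm7 ^= xmm7 << 32
//	xmm7 ^= xmm7 << 64
//	xmm7 ^= 0x5b..5b		(.Lk_s63)
//	xmm0  = xmm7 = t ^ xmm7
//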
.type	_vpaes_schedule_round,%function
.align	4
_vpaes_schedule_round:
	// extract rcon from xmm8
	movi	v4.16b, #0			// vpxor	%xmm4, %xmm4, %xmm4
	ext	v1.16b, $rcon, v4.16b, #15	// vpalignr	\$15, %xmm8, %xmm4, %xmm1
	ext	$rcon, $rcon, $rcon, #15	// vpalignr	\$15, %xmm8, %xmm8, %xmm8
	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1, %xmm7, %xmm7

	// rotate
	dup	v0.4s, v0.s[3]			// vpshufd	\$0xFF, %xmm0, %xmm0
	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	\$1, %xmm0, %xmm0, %xmm0

	// fall through...
	// low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	// smear xmm7
	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	\$4, %xmm7, %xmm1
	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1, %xmm7, %xmm7
	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	\$8, %xmm7, %xmm4
	// subbytes
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9, %xmm0, %xmm1	# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4, %xmm0, %xmm0	# 1 = i
	eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4, %xmm7, %xmm7
	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb	%xmm1, %xmm11, %xmm2	# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0, %xmm1, %xmm1	# 0 = j
	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb	%xmm0, %xmm10, %xmm3	# 3 = 1/i
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb	%xmm1, %xmm10, %xmm4	# 4 = 1/j
	eor	v7.16b, v7.16b, v16.16b		// vpxor	.Lk_s63(%rip), %xmm7, %xmm7
	tbl	v3.16b, {$invlo}, v3.16b	// vpshufb	%xmm3, %xmm10, %xmm3	# 2 = 1/iak
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	tbl	v2.16b, {$invlo}, v4.16b	// vpshufb	%xmm4, %xmm10, %xmm2	# 3 = 1/jak
	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1, %xmm3, %xmm3	# 2 = io
	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0, %xmm2, %xmm2	# 3 = jo
	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3, %xmm13, %xmm4	# 4 = sbou
	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2, %xmm12, %xmm1	# 0 = sb1t
	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4, %xmm1, %xmm1	# 0 = sbox output

	// add in smeared stuff
	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7, %xmm1, %xmm0
	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0, %xmm7
	ret
.size	_vpaes_schedule_round,.-_vpaes_schedule_round
//
//  .aes_schedule_transform
//
//  Linear-transform %xmm0 according to tables at (%r11)
//
//  Requires that %xmm9 = 0x0F0F... as in preheat
//  Output in %xmm0
//  Clobbers %xmm1, %xmm2
//
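//
// Per byte, the effect is (an informal sketch, not original text):
//	out[i] = hi_tbl[in[i] >> 4] ^ lo_tbl[in[i] & 0x0F]
// with lo_tbl/hi_tbl being the two 16-byte tables at (%r11).
//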
.type	_vpaes_schedule_transform,%function
.align	4
_vpaes_schedule_transform:
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9, %xmm0, %xmm1
	ushr	v0.16b, v0.16b, #4		// vpsrlb	\$4, %xmm0, %xmm0
						// vmovdqa	(%r11), %xmm2		# lo
	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb	%xmm1, %xmm2, %xmm2
						// vmovdqa	16(%r11), %xmm1		# hi
	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb	%xmm0, %xmm1, %xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2, %xmm0, %xmm0
	ret
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
//
//  .aes_schedule_mangle
//
//  Mangle xmm0 from (basis-transformed) standard version
//  to our version.
//
//  On encrypt,
//	xor with 0x63
//	multiply by circulant 0,1,1,1
//	apply shiftrows transform
//
//  On decrypt,
//	multiply by "inverse mixcolumns" circulant E,B,D,9
//	deskew
//	apply shiftrows transform
//
//  Writes out to (%rdx), and increments or decrements it
//  Keeps track of round number mod 4 in %r8
//
//  Preserves xmm0
//  Clobbers xmm1-xmm5
//
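// (In this adaptation the decrypt-side tables .Lk_dksd/.Lk_dksb/
// .Lk_dkse/.Lk_dks9 are preloaded into v24-v31 by _vpaes_key_preheat,
// and v9 holds .Lk_mc_forward[0] for the circulant multiplies.)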
.type	_vpaes_schedule_mangle,%function
.align	4
_vpaes_schedule_mangle:
	mov	v4.16b, v0.16b			// vmovdqa	%xmm0, %xmm4	# save xmm0 for later
						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
	cbnz	$dir, .Lschedule_mangle_dec

	// encrypting
	eor	v4.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip), %xmm0, %xmm4
	add	$out, $out, #16			// add	\$16, %rdx
	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5, %xmm4, %xmm4
	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5, %xmm4, %xmm1
	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5, %xmm1, %xmm3
	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1, %xmm4, %xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10), %xmm1
	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4, %xmm3, %xmm3

	b	.Lschedule_mangle_both
.Lschedule_mangle_dec:
	// inverse mix columns
						// lea	.Lk_dksd(%rip),%r11
	ushr	v1.16b, v4.16b, #4		// vpsrlb	\$4, %xmm4, %xmm1	# 1 = hi
	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9, %xmm4, %xmm4	# 4 = lo

						// vmovdqa	0x00(%r11), %xmm2
	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4, %xmm2, %xmm2
						// vmovdqa	0x10(%r11), %xmm3
	tbl	v3.16b, {v25.16b}, v1.16b	// vpshufb	%xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2, %xmm3, %xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5, %xmm3, %xmm3

						// vmovdqa	0x20(%r11), %xmm2
	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4, %xmm2, %xmm2
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3, %xmm2, %xmm2
						// vmovdqa	0x30(%r11), %xmm3
	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2, %xmm3, %xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5, %xmm3, %xmm3

						// vmovdqa	0x40(%r11), %xmm2
	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4, %xmm2, %xmm2
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3, %xmm2, %xmm2
						// vmovdqa	0x50(%r11), %xmm3
	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2, %xmm3, %xmm3

						// vmovdqa	0x60(%r11), %xmm2
	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4, %xmm2, %xmm2
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5, %xmm3, %xmm3
						// vmovdqa	0x70(%r11), %xmm4
	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1, %xmm4, %xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10), %xmm1
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3, %xmm2, %xmm2
	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2, %xmm4, %xmm3

	sub	$out, $out, #16			// add	\$-16, %rdx
.Lschedule_mangle_both:
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1, %xmm3, %xmm3
	add	x8, x8, #64-16			// add	\$-16, %r8
	and	x8, x8, #~(1<<6)		// and	\$0x30, %r8
	st1	{v3.2d}, [$out]			// vmovdqu	%xmm3, (%rdx)
	ret
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
.globl	vpaes_set_encrypt_key
.type	vpaes_set_encrypt_key,%function
.align	4
vpaes_set_encrypt_key:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, $bits, #5		// shr	\$5,%eax
	add	w9, w9, #5		// add	\$5,%eax
	str	w9, [$out,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
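	// nbits/32+5 gives 9, 11 or 13 for 128-, 192- and 256-bit keys:
	// the number of "middle" rounds the encrypt/decrypt cores loop
	// over, one less than the textbook 10/12/14 since the final
	// round is handled separately after the loop.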
	mov	$dir, #0		// mov	\$0,%ecx
	mov	x8, #0x30		// mov	\$0x30,%r8d
	bl	_vpaes_schedule_core
	eor	x0, x0, x0

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
.globl	vpaes_set_decrypt_key
.type	vpaes_set_decrypt_key,%function
.align	4
vpaes_set_decrypt_key:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, $bits, #5		// shr	\$5,%eax
	add	w9, w9, #5		// add	\$5,%eax
	str	w9, [$out,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
	lsl	w9, w9, #4		// shl	\$4,%eax
	add	$out, $out, #16		// lea	16(%rdx,%rax),%rdx
	add	$out, $out, x9
	mov	$dir, #1		// mov	\$1,%ecx
	lsr	w8, $bits, #1		// shr	\$1,%r8d
	and	x8, x8, #32		// and	\$32,%r8d
	eor	x8, x8, #32		// xor	\$32,%r8d	# nbits==192?0:32
	bl	_vpaes_schedule_core

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
my ($inp,$out,$len,$key,$ivec,$dir) = map("x$_",(0..5));
.globl	vpaes_cbc_encrypt
.type	vpaes_cbc_encrypt,%function
.align	4
vpaes_cbc_encrypt:
	cbz	$len, .Lcbc_abort
	cmp	w5, #0			// check direction
	b.eq	vpaes_cbc_decrypt

	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	x17, $len		// reassign
	mov	x2,  $key		// reassign

	ld1	{v0.16b}, [$ivec]	// load ivec
	bl	_vpaes_encrypt_preheat
	b	.Lcbc_enc_loop
.align	4
.Lcbc_enc_loop:
	ld1	{v7.16b}, [$inp],#16	// load input
	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [$out],#16	// save output
	subs	x17, x17, #16
	b.hi	.Lcbc_enc_loop
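	// The loop above implements CBC chaining, C[i] = E_k(P[i] ^ C[i-1]);
	// v0 doubles as the running IV between iterations.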
	st1	{v0.16b}, [$ivec]	// write ivec

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
.Lcbc_abort:
	ret
.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
.type	vpaes_cbc_decrypt,%function
.align	4
vpaes_cbc_decrypt:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, $len		// reassign
	mov	x2,  $key		// reassign
	ld1	{v6.16b}, [$ivec]	// load ivec
	bl	_vpaes_decrypt_preheat
	tst	x17, #1
	b.eq	.Lcbc_dec_loop2x
	ld1	{v7.16b}, [$inp], #16	// load input
	bl	_vpaes_decrypt_core
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	orr	v6.16b, v7.16b, v7.16b	// next ivec value
	st1	{v0.16b}, [$out], #16
	subs	x17, x17, #16
	b.ls	.Lcbc_dec_done
.align	4
.Lcbc_dec_loop2x:
	ld1	{v14.16b,v15.16b}, [$inp], #32
	bl	_vpaes_decrypt_2x
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	eor	v1.16b, v1.16b, v14.16b
	orr	v6.16b, v15.16b, v15.16b
	st1	{v0.16b,v1.16b}, [$out], #32
	subs	x17, x17, #32
	b.hi	.Lcbc_dec_loop2x
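	// The 2x loop computes P[i] = D_k(C[i]) ^ C[i-1] for a pair of
	// blocks per iteration; v6 carries the preceding ciphertext block
	// across iterations, and v14/v15 hold the pair just loaded.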
.Lcbc_dec_done:
	st1	{v6.16b}, [$ivec]

	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
.globl	vpaes_ecb_encrypt
.type	vpaes_ecb_encrypt,%function
.align	4
vpaes_ecb_encrypt:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, $len
	mov	x2,  $key
	bl	_vpaes_encrypt_preheat
	tst	x17, #1
	b.eq	.Lecb_enc_loop2x
	ld1	{v7.16b}, [$inp],#16
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [$out],#16
	subs	x17, x17, #16
	b.ls	.Lecb_enc_done
.align	4
.Lecb_enc_loop2x:
	ld1	{v14.16b,v15.16b}, [$inp], #32
	bl	_vpaes_encrypt_2x
	st1	{v0.16b,v1.16b}, [$out], #32
	subs	x17, x17, #32
	b.hi	.Lecb_enc_loop2x
.Lecb_enc_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
.globl	vpaes_ecb_decrypt
.type	vpaes_ecb_decrypt,%function
.align	4
vpaes_ecb_decrypt:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, $len
	mov	x2,  $key
	bl	_vpaes_decrypt_preheat
	tst	x17, #1
	b.eq	.Lecb_dec_loop2x
	ld1	{v7.16b}, [$inp],#16
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [$out],#16
	subs	x17, x17, #16
	b.ls	.Lecb_dec_done
.align	4
.Lecb_dec_loop2x:
	ld1	{v14.16b,v15.16b}, [$inp], #32
	bl	_vpaes_decrypt_2x
	st1	{v0.16b,v1.16b}, [$out], #32
	subs	x17, x17, #32
	b.hi	.Lecb_dec_loop2x
.Lecb_dec_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
close STDOUT or die "error closing STDOUT: $!";