3 ######################################################################
4 ## Constant-time SSSE3 AES core implementation.
7 ## By Mike Hamburg (Stanford University), 2009
10 ## For details see http://shiftleft.org/papers/vector_aes/ and
11 ## http://crypto.stanford.edu/vpaes/.
13 ######################################################################
14 # ARMv8 NEON adaptation by <appro@openssl.org>
# The reason for undertaking this effort is that there is at least one
# popular SoC based on Cortex-A53 that doesn't have crypto extensions.
19 # CBC enc ECB enc/dec(*) [bit-sliced enc/dec]
20 # Cortex-A53 21.5 18.1/20.6 [17.5/19.8 ]
21 # Cortex-A57 36.0(**) 20.4/24.9(**) [14.4/16.6 ]
22 # X-Gene 45.9(**) 45.8/57.7(**) [33.1/37.6(**) ]
23 # Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ]
24 # Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ]
# (*) ECB denotes approximate result for parallelizable modes
27 # such as CBC decrypt, CTR, etc.;
28 # (**) these results are worse than scalar compiler-generated
29 # code, but it's constant-time and therefore preferred;
30 # (***) presented for reference/comparison purposes;
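#
# The constant-time property comes from doing every S-box step with tbl
# permutes on in-register tables rather than data-dependent loads, so no
# memory address ever depends on secret data. A rough C-intrinsics model
# of one such half-table lookup (an illustrative sketch only, not part of
# the generated code; the helper name is hypothetical):
#
#	#include <arm_neon.h>
#
#	// 16-entry table lookup keyed on the low nibble of each byte
#	static uint8x16_t lookup16(uint8x16_t tab, uint8x16_t x)
#	{
#	    return vqtbl1q_u8(tab, vandq_u8(x, vdupq_n_u8(0x0f)));
#	}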
33 while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
35 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
36 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
37 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
38 die "can't locate arm-xlate.pl";
40 open OUT,"| \"$^X\" $xlate $flavour $output";
46 .type _vpaes_consts,%object
47 .align 7 // totally strategic alignment
49 .Lk_mc_forward: // mc_forward
50 .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
51 .quad 0x080B0A0904070605, 0x000302010C0F0E0D
52 .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
53 .quad 0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward: // mc_backward
55 .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
56 .quad 0x020100030E0D0C0F, 0x0A09080B06050407
57 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
58 .quad 0x0A09080B06050407, 0x020100030E0D0C0F
60 .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
61 .quad 0x030E09040F0A0500, 0x0B06010C07020D08
62 .quad 0x0F060D040B020900, 0x070E050C030A0108
63 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
69 .quad 0x0E05060F0D080180, 0x040703090A0B0C02
70 .quad 0x01040A060F0B0780, 0x030D0E0C02050809
71 .Lk_ipt: // input transform (lo, hi)
72 .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
73 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
74 .Lk_sbo: // sbou, sbot
75 .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
76 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
77 .Lk_sb1: // sb1u, sb1t
78 .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
79 .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
80 .Lk_sb2: // sb2u, sb2t
81 .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
82 .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
87 .Lk_dipt: // decryption input transform
88 .quad 0x0F505B040B545F00, 0x154A411E114E451A
89 .quad 0x86E383E660056500, 0x12771772F491F194
90 .Lk_dsbo: // decryption sbox final output
91 .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
92 .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
93 .Lk_dsb9: // decryption sbox output *9*u, *9*t
94 .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
95 .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
96 .Lk_dsbd: // decryption sbox output *D*u, *D*t
97 .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
98 .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
99 .Lk_dsbb: // decryption sbox output *B*u, *B*t
100 .quad 0xD022649296B44200, 0x602646F6B0F2D404
101 .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
102 .Lk_dsbe: // decryption sbox output *E*u, *E*t
103 .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
104 .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
107 // Key schedule constants
109 .Lk_dksd: // decryption key schedule: invskew x*D
110 .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
111 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
112 .Lk_dksb: // decryption key schedule: invskew x*B
113 .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
114 .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
115 .Lk_dkse: // decryption key schedule: invskew x*E + 0x63
116 .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
117 .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
118 .Lk_dks9: // decryption key schedule: invskew x*9
119 .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
120 .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
123 .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
125 .Lk_opt: // output transform
126 .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
127 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
128 .Lk_deskew: // deskew tables: inverts the sbox's "skew"
129 .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
130 .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
.asciz "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)"
133 .size _vpaes_consts,.-_vpaes_consts
138 my ($inp,$out,$key) = map("x$_",(0..2));
140 my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23));
141 my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
142 my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31));
148 ## Fills register %r10 -> .aes_consts (so you can -fPIC)
149 ## and %xmm9-%xmm15 as specified below.
151 .type _vpaes_encrypt_preheat,%function
153 _vpaes_encrypt_preheat:
156 ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv
157 ld1 {v20.2d-v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo
158 ld1 {v24.2d-v27.2d}, [x10] // .Lk_sb1, .Lk_sb2
160 .size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
165 ## AES-encrypt %xmm0.
169 ## %xmm9-%xmm15 as in _vpaes_preheat
170 ## (%rdx) = scheduled keys
173 ## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
174 ## Preserves %xmm6 - %xmm8 so you get some local vectors
177 .type _vpaes_encrypt_core,%function
181 ldr w8, [$key,#240] // pull rounds
182 adr x11, .Lk_mc_forward+16
183 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
184 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
185 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
186 ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
187 tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
188 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
189 tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
190 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
191 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
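// The and/ushr split above feeds two 16-entry tbl tables, which together
// compute a byte-indexed substitution as t_lo[x&0xf] ^ t_hi[x>>4] (the
// vpaes tables are constructed to split across nibbles like this). A
// rough C-intrinsics model, as an illustrative sketch only; lookup256 is
// a hypothetical name, not part of the generated code:
//
//	uint8x16_t lookup256(uint8x16_t lo, uint8x16_t hi, uint8x16_t x)
//	{
//	    return veorq_u8(vqtbl1q_u8(lo, vandq_u8(x, vdupq_n_u8(0x0f))),
//	                    vqtbl1q_u8(hi, vshrq_n_u8(x, 4)));
//	}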
196 // middle of middle round
198 tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
199 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
200 tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
201 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
202 tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
203 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
204 tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
205 ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
206 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
207 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
208 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
209 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
210 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
211 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
212 and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... mod 4
213 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
214 sub w8, w8, #1 // nr--
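// Note on the block above: sb1 produces the S-box output A and sb2 its
// GF(2^8) double 2A; .Lk_mc_forward/.Lk_mc_backward rotate each column
// by one byte, so the eor chain accumulates 2A+3B+C+D of rotated copies,
// i.e. MixColumns fused with ShiftRows (the trick from the vpaes paper).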
218 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
219 ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
220 tbl v5.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
221 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
222 tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
223 tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
224 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
225 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
226 tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
227 tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
228 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
229 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
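// The ladder above is the vpaes tower-field decomposition of the S-box's
// GF(2^8) inversion: each tbl is a 16-entry lookup keyed on one nibble
// (a/k, 1/i, 1/j, then 1/iak and 1/jak per the comments), so the whole
// inversion runs without any secret-dependent memory access.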
230 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
233 // middle of last round
235 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
236 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
237 tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
238 ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
239 tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
240 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
241 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
242 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0
244 .size _vpaes_encrypt_core,.-_vpaes_encrypt_core
247 .type vpaes_encrypt,%function
250 stp x29,x30,[sp,#-16]!
254 bl _vpaes_encrypt_preheat
255 bl _vpaes_encrypt_core
260 .size vpaes_encrypt,.-vpaes_encrypt
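##
##  _vpaes_encrypt_2x applies the same transformation to two blocks at
##  once: v14-v15 input, v0-v1 output. The two interleaved dependency
##  chains (v0-v5 and v8-v13) are independent, which is what buys the
##  better figures for parallelizable modes in the table above.
##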
262 .type _vpaes_encrypt_2x,%function
266 ldr w8, [$key,#240] // pull rounds
267 adr x11, .Lk_mc_forward+16
268 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
269 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key
270 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
271 ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
272 and v9.16b, v15.16b, v17.16b
273 ushr v8.16b, v15.16b, #4
274 tbl v1.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1
275 tbl v9.16b, {$iptlo}, v9.16b
276 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
277 tbl v2.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2
278 tbl v10.16b, {$ipthi}, v8.16b
279 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0
280 eor v8.16b, v9.16b, v16.16b
281 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
282 eor v8.16b, v8.16b, v10.16b
287 // middle of middle round
289 tbl v4.16b, {$sb1t}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
290 tbl v12.16b, {$sb1t}, v10.16b
291 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
292 tbl v0.16b, {$sb1u}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
293 tbl v8.16b, {$sb1u}, v11.16b
294 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
295 eor v12.16b, v12.16b, v16.16b
296 tbl v5.16b, {$sb2t}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
297 tbl v13.16b, {$sb2t}, v10.16b
298 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
299 eor v8.16b, v8.16b, v12.16b
300 tbl v2.16b, {$sb2u}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
301 tbl v10.16b, {$sb2u}, v11.16b
302 ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
303 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
304 tbl v11.16b, {v8.16b}, v1.16b
305 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
306 eor v10.16b, v10.16b, v13.16b
307 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
308 tbl v8.16b, {v8.16b}, v4.16b
309 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
310 eor v11.16b, v11.16b, v10.16b
311 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
312 tbl v12.16b, {v11.16b},v1.16b
313 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
314 eor v8.16b, v8.16b, v11.16b
315 and x11, x11, #~(1<<6) // and \$0x30, %r11 # ... mod 4
316 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
317 eor v8.16b, v8.16b, v12.16b
318 sub w8, w8, #1 // nr--
322 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k
323 ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
324 and v9.16b, v8.16b, v17.16b
325 ushr v8.16b, v8.16b, #4
326 tbl v5.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
327 tbl v13.16b, {$invhi},v9.16b
328 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
329 eor v9.16b, v9.16b, v8.16b
330 tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
331 tbl v11.16b, {$invlo},v8.16b
332 tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
333 tbl v12.16b, {$invlo},v9.16b
334 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
335 eor v11.16b, v11.16b, v13.16b
336 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
337 eor v12.16b, v12.16b, v13.16b
338 tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
339 tbl v10.16b, {$invlo},v11.16b
340 tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
341 tbl v11.16b, {$invlo},v12.16b
342 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
343 eor v10.16b, v10.16b, v9.16b
344 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
345 eor v11.16b, v11.16b, v8.16b
346 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5
347 cbnz w8, .Lenc_2x_loop
349 // middle of last round
351 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
352 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
353 tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
354 tbl v12.16b, {$sbou}, v10.16b
355 ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
356 tbl v0.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
357 tbl v8.16b, {$sbot}, v11.16b
358 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
359 eor v12.16b, v12.16b, v16.16b
360 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A
361 eor v8.16b, v8.16b, v12.16b
362 tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0
363 tbl v1.16b, {v8.16b},v1.16b
365 .size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x
367 .type _vpaes_decrypt_preheat,%function
369 _vpaes_decrypt_preheat:
373 ld1 {v18.2d-v19.2d}, [x10],#32 // .Lk_inv
374 ld1 {v20.2d-v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo
375 ld1 {v24.2d-v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd
376 ld1 {v28.2d-v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe
378 .size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
383 ## Same API as encryption core.
385 .type _vpaes_decrypt_core,%function
389 ldr w8, [$key,#240] // pull rounds
391 // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
392 lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11
393 eor x11, x11, #0x30 // xor \$0x30, %r11
395 and x11, x11, #0x30 // and \$0x30, %r11
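// Worked example: for AES-128 the stored round count is 9, so
// (9<<4)^0x30 = 0xa0 and 0xa0&0x30 = 0x20, selecting the .Lk_sr entry
// used for the final output permute below.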
397 adr x10, .Lk_mc_forward+48
399 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
400 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
401 ushr v0.16b, v7.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
402 tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
403 ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5
404 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
405 tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
406 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
407 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
413 // Inverse mix columns
415 // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
416 // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
417 tbl v4.16b, {$sb9u}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
418 tbl v1.16b, {$sb9t}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
419 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
420 // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
421 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
422 // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
424 tbl v4.16b, {$sbdu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
425 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
426 tbl v1.16b, {$sbdt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
427 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
428 // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
429 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
430 // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
432 tbl v4.16b, {$sbbu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
433 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
434 tbl v1.16b, {$sbbt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
435 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
436 // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
437 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
438 // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
440 tbl v4.16b, {$sbeu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
441 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
442 tbl v1.16b, {$sbet}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
443 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
444 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr \$12, %xmm5, %xmm5, %xmm5
445 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
446 sub w8, w8, #1 // sub \$1,%rax # nr--
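// The four table pairs above (sb9, sbd, sbb, sbe) multiply the inverted
// bytes by 9, D, B and E, the coefficients of the InvMixColumns
// circulant (E,B,D,9); the ext rotates the mc_forward constant so the
// next round picks up the next column rotation.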
450 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
451 ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
452 tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
453 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
454 tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
455 tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
456 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
457 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
458 tbl v2.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
459 tbl v3.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
460 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
461 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
462 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
465 // middle of last round
466 // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
467 tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
468 // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
469 ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
470 tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
471 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
472 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
473 tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0
475 .size _vpaes_decrypt_core,.-_vpaes_decrypt_core
478 .type vpaes_decrypt,%function
481 stp x29,x30,[sp,#-16]!
485 bl _vpaes_decrypt_preheat
486 bl _vpaes_decrypt_core
491 .size vpaes_decrypt,.-vpaes_decrypt
493 // v14-v15 input, v0-v1 output
494 .type _vpaes_decrypt_2x,%function
498 ldr w8, [$key,#240] // pull rounds
500 // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
501 lsl x11, x8, #4 // mov %rax, %r11; shl \$4, %r11
502 eor x11, x11, #0x30 // xor \$0x30, %r11
504 and x11, x11, #0x30 // and \$0x30, %r11
506 adr x10, .Lk_mc_forward+48
508 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key
509 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
510 ushr v0.16b, v14.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
511 and v9.16b, v15.16b, v17.16b
512 ushr v8.16b, v15.16b, #4
513 tbl v2.16b, {$iptlo},v1.16b // vpshufb %xmm1, %xmm2, %xmm2
514 tbl v10.16b, {$iptlo},v9.16b
515 ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5
516 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
517 tbl v0.16b, {$ipthi},v0.16b // vpshufb %xmm0, %xmm1, %xmm0
518 tbl v8.16b, {$ipthi},v8.16b
519 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2
520 eor v10.16b, v10.16b, v16.16b
521 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
522 eor v8.16b, v8.16b, v10.16b
528 // Inverse mix columns
530 // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
531 // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
532 tbl v4.16b, {$sb9u}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
533 tbl v12.16b, {$sb9u}, v10.16b
534 tbl v1.16b, {$sb9t}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
535 tbl v9.16b, {$sb9t}, v11.16b
536 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0
537 eor v8.16b, v12.16b, v16.16b
538 // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
539 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
540 eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
541 // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
543 tbl v4.16b, {$sbdu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
544 tbl v12.16b, {$sbdu}, v10.16b
545 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
546 tbl v8.16b, {v8.16b},v5.16b
547 tbl v1.16b, {$sbdt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
548 tbl v9.16b, {$sbdt}, v11.16b
549 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
550 eor v8.16b, v8.16b, v12.16b
551 // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
552 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
553 eor v8.16b, v8.16b, v9.16b
554 // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
556 tbl v4.16b, {$sbbu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
557 tbl v12.16b, {$sbbu}, v10.16b
558 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
559 tbl v8.16b, {v8.16b},v5.16b
560 tbl v1.16b, {$sbbt}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
561 tbl v9.16b, {$sbbt}, v11.16b
562 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
563 eor v8.16b, v8.16b, v12.16b
564 // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
565 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
566 eor v8.16b, v8.16b, v9.16b
567 // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
569 tbl v4.16b, {$sbeu}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
570 tbl v12.16b, {$sbeu}, v10.16b
571 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch
572 tbl v8.16b, {v8.16b},v5.16b
573 tbl v1.16b, {$sbet}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
574 tbl v9.16b, {$sbet}, v11.16b
575 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
576 eor v8.16b, v8.16b, v12.16b
577 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr \$12, %xmm5, %xmm5, %xmm5
578 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
579 eor v8.16b, v8.16b, v9.16b
580 sub w8, w8, #1 // sub \$1,%rax # nr--
584 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
585 ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
586 and v9.16b, v8.16b, v17.16b
587 ushr v8.16b, v8.16b, #4
588 tbl v2.16b, {$invhi},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
589 tbl v10.16b, {$invhi},v9.16b
590 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
591 eor v9.16b, v9.16b, v8.16b
592 tbl v3.16b, {$invlo},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
593 tbl v11.16b, {$invlo},v8.16b
594 tbl v4.16b, {$invlo},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
595 tbl v12.16b, {$invlo},v9.16b
596 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
597 eor v11.16b, v11.16b, v10.16b
598 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
599 eor v12.16b, v12.16b, v10.16b
600 tbl v2.16b, {$invlo},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
601 tbl v10.16b, {$invlo},v11.16b
602 tbl v3.16b, {$invlo},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
603 tbl v11.16b, {$invlo},v12.16b
604 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io
605 eor v10.16b, v10.16b, v9.16b
606 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
607 eor v11.16b, v11.16b, v8.16b
608 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0
609 cbnz w8, .Ldec_2x_loop
611 // middle of last round
612 // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
613 tbl v4.16b, {$sbou}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
614 tbl v12.16b, {$sbou}, v10.16b
615 // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
616 tbl v1.16b, {$sbot}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
617 tbl v9.16b, {$sbot}, v11.16b
618 ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
619 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
620 eor v12.16b, v12.16b, v16.16b
621 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A
622 eor v8.16b, v9.16b, v12.16b
623 tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0
624 tbl v1.16b, {v8.16b},v2.16b
626 .size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x
630 my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3");
631 my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));
634 ########################################################
636 ## AES key schedule ##
638 ########################################################
639 .type _vpaes_key_preheat,%function
643 movi v16.16b, #0x5b // .Lk_s63
645 movi v17.16b, #0x0f // .Lk_s0F
646 ld1 {v18.2d-v21.2d}, [x10] // .Lk_inv, .Lk_ipt
648 ld1 {v22.2d-v23.2d}, [x11] // .Lk_sb1
649 adr x11, .Lk_mc_forward
650 ld1 {v24.2d-v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb
651 ld1 {v28.2d-v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9
652 ld1 {v8.2d}, [x10] // .Lk_rcon
653 ld1 {v9.2d}, [x11] // .Lk_mc_forward[0]
655 .size _vpaes_key_preheat,.-_vpaes_key_preheat
657 .type _vpaes_schedule_core,%function
659 _vpaes_schedule_core:
660 stp x29, x30, [sp,#-16]!
663 bl _vpaes_key_preheat // load the tables
665 ld1 {v0.16b}, [$inp],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned)
668 mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3
669 bl _vpaes_schedule_transform
670 mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7
672 adr x10, .Lk_sr // lea .Lk_sr(%rip),%r10
674 cbnz $dir, .Lschedule_am_decrypting
676 // encrypting, output zeroth round key after transform
677 st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx)
680 .Lschedule_am_decrypting:
681 // decrypting, output zeroth round key after shiftrows
682 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
683 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
684 st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx)
685 eor x8, x8, #0x30 // xor \$0x30, %r8
688 cmp $bits, #192 // cmp \$192, %esi
696 ## 128-bit specific part of key schedule.
698 ## This schedule is really simple, because all its parts
699 ## are accomplished by the subroutines.
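##
## A rough count: the loop runs 10 times, giving 9 mangled keys plus
## one from .Lschedule_mangle_last; with the zeroth key that is the
## 11 round keys of AES-128.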
702 mov $inp, #10 // mov \$10, %esi
705 sub $inp, $inp, #1 // dec %esi
706 bl _vpaes_schedule_round
707 cbz $inp, .Lschedule_mangle_last
708 bl _vpaes_schedule_mangle // write output
714 ## 192-bit specific part of key schedule.
716 ## The main body of this schedule is the same as the 128-bit
717 ## schedule, but with more smearing. The long, high side is
718 ## stored in %xmm7 as before, and the short, low side is in
719 ## the high bits of %xmm6.
721 ## This schedule is somewhat nastier, however, because each
722 ## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round keys.
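##
## A rough count: four cycles of three round keys each (the last
## cycle's final key coming from .Lschedule_mangle_last), plus the
## zeroth key, give the 13 round keys of AES-192.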
729 ld1 {v0.16b}, [$inp] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
730 bl _vpaes_schedule_transform // input transform
731 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part
732 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4
733 ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
734 mov $inp, #4 // mov \$4, %esi
737 sub $inp, $inp, #1 // dec %esi
738 bl _vpaes_schedule_round
739 ext v0.16b, v6.16b, v0.16b, #8 // vpalignr \$8,%xmm6,%xmm0,%xmm0
740 bl _vpaes_schedule_mangle // save key n
741 bl _vpaes_schedule_192_smear
742 bl _vpaes_schedule_mangle // save key n+1
743 bl _vpaes_schedule_round
744 cbz $inp, .Lschedule_mangle_last
745 bl _vpaes_schedule_mangle // save key n+2
746 bl _vpaes_schedule_192_smear
752 ## 256-bit specific part of key schedule.
754 ## The structure here is very similar to the 128-bit
755 ## schedule, but with an additional "low side" in
756 ## %xmm6. The low side's rounds are the same as the
757 ## high side's, except no rcon and no rotation.
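##
## A rough count: seven cycles each emit a low-side and a high-side
## key (the last via .Lschedule_mangle_last); with the zeroth key that
## is the 15 round keys of AES-256.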
761 ld1 {v0.16b}, [$inp] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
762 bl _vpaes_schedule_transform // input transform
763 mov $inp, #7 // mov \$7, %esi
766 sub $inp, $inp, #1 // dec %esi
767 bl _vpaes_schedule_mangle // output low result
768 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
771 bl _vpaes_schedule_round
772 cbz $inp, .Lschedule_mangle_last
773 bl _vpaes_schedule_mangle
775 // low round. swap xmm7 and xmm6
776 dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0
778 mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5
779 mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7
780 bl _vpaes_schedule_low_round
781 mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7
786 ## .aes_schedule_mangle_last
788 ## Mangler for last round of key schedule
790 ## when encrypting, outputs out(%xmm0) ^ 63
791 ## when decrypting, outputs unskew(%xmm0)
793 ## Always called right before return... jumps to cleanup and exits
796 .Lschedule_mangle_last:
797 // schedule last round key from xmm0
798 adr x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew
799 cbnz $dir, .Lschedule_mangle_last_dec
802 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1
803 adr x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform
804 add $out, $out, #32 // add \$32, %rdx
805 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute
807 .Lschedule_mangle_last_dec:
808 ld1 {v20.2d-v21.2d}, [x11] // reload constants
809 sub $out, $out, #16 // add \$-16, %rdx
810 eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0
811 bl _vpaes_schedule_transform // output transform
812 st1 {v0.2d}, [$out] // vmovdqu %xmm0, (%rdx) # save last key
815 eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0
816 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
817 eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2
818 eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3
819 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4
820 eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5
821 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
822 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
823 ldp x29, x30, [sp],#16
825 .size _vpaes_schedule_core,.-_vpaes_schedule_core
828 ## .aes_schedule_192_smear
830 ## Smear the short, low side in the 192-bit key schedule.
## Inputs:
## %xmm7: high side, b a x y
## %xmm6: low side, d c 0 0
##
## Outputs:
## %xmm6: b+c+d b+c 0 0
## %xmm0: b+c+d b+c b a
841 .type _vpaes_schedule_192_smear,%function
843 _vpaes_schedule_192_smear:
846 ins v1.s[3], v6.s[2] // vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
847 ins v0.s[0], v7.s[2] // vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
848 eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
849 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1
850 eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
851 mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0
852 ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
854 .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
857 ## .aes_schedule_round
859 ## Runs one main round of the key schedule on %xmm0, %xmm7
861 ## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of %xmm7.
##
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for the next rcon.
868 ## Smears the dwords of %xmm7 by xoring the low into the
869 ## second low, result into third, result into highest.
871 ## Returns results in %xmm7 = %xmm0.
872 ## Clobbers %xmm1-%xmm4, %r11.
874 .type _vpaes_schedule_round,%function
876 _vpaes_schedule_round:
877 // extract rcon from xmm8
878 movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4
879 ext v1.16b, $rcon, v4.16b, #15 // vpalignr \$15, %xmm8, %xmm4, %xmm1
880 ext $rcon, $rcon, $rcon, #15 // vpalignr \$15, %xmm8, %xmm8, %xmm8
881 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
884 dup v0.4s, v0.s[3] // vpshufd \$0xFF, %xmm0, %xmm0
885 ext v0.16b, v0.16b, v0.16b, #1 // vpalignr \$1, %xmm0, %xmm0, %xmm0
889 // low round: same as high round, but no rotation and no rcon.
890 _vpaes_schedule_low_round:
892 ext v1.16b, v4.16b, v7.16b, #12 // vpslldq \$4, %xmm7, %xmm1
893 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7
894 ext v4.16b, v4.16b, v7.16b, #8 // vpslldq \$8, %xmm7, %xmm4
897 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k
898 ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0 # 1 = i
899 eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7
900 tbl v2.16b, {$invhi}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
901 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j
902 tbl v3.16b, {$invlo}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
903 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
904 tbl v4.16b, {$invlo}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
905 eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7
906 tbl v3.16b, {$invlo}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
907 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
908 tbl v2.16b, {$invlo}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
909 eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io
910 eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
911 tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
912 tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
913 eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
915 // add in smeared stuff
916 eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0
917 eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7
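// Note: the second eor recomputes v1^v7, so v7 ends up equal to the
// returned v0 without a separate mov (hence the vmovdqa in the
// reference comment).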
919 .size _vpaes_schedule_round,.-_vpaes_schedule_round
922 ## .aes_schedule_transform
924 ## Linear-transform %xmm0 according to tables at (%r11)
926 ## Requires that %xmm9 = 0x0F0F... as in preheat
928 ## Clobbers %xmm1, %xmm2
930 .type _vpaes_schedule_transform,%function
932 _vpaes_schedule_transform:
933 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1
934 ushr v0.16b, v0.16b, #4 // vpsrlb \$4, %xmm0, %xmm0
935 // vmovdqa (%r11), %xmm2 # lo
936 tbl v2.16b, {$iptlo}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2
937 // vmovdqa 16(%r11), %xmm1 # hi
938 tbl v0.16b, {$ipthi}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0
939 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0
941 .size _vpaes_schedule_transform,.-_vpaes_schedule_transform
944 ## .aes_schedule_mangle
## Mangles %xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
## xor with 0x63
## multiply by circulant 0,1,1,1
## apply shiftrows transform
##
## On decrypt,
## multiply by "inverse mixcolumns" circulant E,B,D,9
## deskew, and
## apply shiftrows transform
961 ## Writes out to (%rdx), and increments or decrements it
962 ## Keeps track of round number mod 4 in %r8
964 ## Clobbers xmm1-xmm5
966 .type _vpaes_schedule_mangle,%function
968 _vpaes_schedule_mangle:
969 mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later
970 // vmovdqa .Lk_mc_forward(%rip),%xmm5
971 cbnz $dir, .Lschedule_mangle_dec
974 eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4
975 add $out, $out, #16 // add \$16, %rdx
976 tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4
977 tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1
978 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3
979 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4
980 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
981 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3
983 b .Lschedule_mangle_both
985 .Lschedule_mangle_dec:
986 // inverse mix columns
987 // lea .Lk_dksd(%rip),%r11
988 ushr v1.16b, v4.16b, #4 // vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
989 and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo
991 // vmovdqa 0x00(%r11), %xmm2
992 tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
993 // vmovdqa 0x10(%r11), %xmm3
994 tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
995 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
996 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
998 // vmovdqa 0x20(%r11), %xmm2
999 tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
1000 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
1001 // vmovdqa 0x30(%r11), %xmm3
1002 tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
1003 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
1004 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
1006 // vmovdqa 0x40(%r11), %xmm2
1007 tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
1008 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
1009 // vmovdqa 0x50(%r11), %xmm3
1010 tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
1011 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3
1013 // vmovdqa 0x60(%r11), %xmm2
1014 tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2
1015 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3
1016 // vmovdqa 0x70(%r11), %xmm4
1017 tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4
1018 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1
1019 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2
1020 eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3
1022 sub $out, $out, #16 // add \$-16, %rdx
1024 .Lschedule_mangle_both:
1025 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3
1026 add x8, x8, #64-16 // add \$-16, %r8
1027 and x8, x8, #~(1<<6) // and \$0x30, %r8
1028 st1 {v3.2d}, [$out] // vmovdqu %xmm3, (%rdx)
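// x8 cycles through the .Lk_sr offsets 0x30 -> 0x20 -> 0x10 -> 0x00 ->
// 0x30: adding 48 and clearing bit 6 is subtraction of 16 mod 64, the
// AArch64 spelling of the x86 add \$-16 / and \$0x30 pair above.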
1030 .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
1032 .globl vpaes_set_encrypt_key
1033 .type vpaes_set_encrypt_key,%function
1035 vpaes_set_encrypt_key:
1036 stp x29,x30,[sp,#-16]!
1038 stp d8,d9,[sp,#-16]! // ABI spec says so
1040 lsr w9, $bits, #5 // shr \$5,%eax
1041 add w9, w9, #5 // \$5,%eax
1042 str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
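// Worked values: 128 -> (128>>5)+5 = 9, 192 -> 11, 256 -> 13; the
// cores run that many full rounds in their loops plus one final round,
// i.e. the standard 10/12/14 AES rounds.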
1044 mov $dir, #0 // mov \$0,%ecx
1045 mov x8, #0x30 // mov \$0x30,%r8d
1046 bl _vpaes_schedule_core
1050 ldp x29,x30,[sp],#16
1052 .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
1054 .globl vpaes_set_decrypt_key
1055 .type vpaes_set_decrypt_key,%function
1057 vpaes_set_decrypt_key:
1058 stp x29,x30,[sp,#-16]!
1060 stp d8,d9,[sp,#-16]! // ABI spec says so
1062 lsr w9, $bits, #5 // shr \$5,%eax
1063 add w9, w9, #5 // \$5,%eax
1064 str w9, [$out,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
1065 lsl w9, w9, #4 // shl \$4,%eax
1066 add $out, $out, #16 // lea 16(%rdx,%rax),%rdx
1069 mov $dir, #1 // mov \$1,%ecx
1070 lsr w8, $bits, #1 // shr \$1,%r8d
1071 and x8, x8, #32 // and \$32,%r8d
1072 eor x8, x8, #32 // xor \$32,%r8d # nbits==192?0:32
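// Worked values: 192>>1 = 96, 96&32 = 32, 32^32 = 0; for 128- and
// 256-bit keys the and gives 0 and the xor gives 32, matching the
// "nbits==192?0:32" note above.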
1073 bl _vpaes_schedule_core
1076 ldp x29,x30,[sp],#16
1078 .size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
1082 my ($inp,$out,$len,$key,$ivec,$dir) = map("x$_",(0..5));
1085 .globl vpaes_cbc_encrypt
1086 .type vpaes_cbc_encrypt,%function
1089 cbz $len, .Lcbc_abort
1090 cmp w5, #0 // check direction
1091 b.eq vpaes_cbc_decrypt
1093 stp x29,x30,[sp,#-16]!
1096 mov x17, $len // reassign
1097 mov x2, $key // reassign
1099 ld1 {v0.16b}, [$ivec] // load ivec
1100 bl _vpaes_encrypt_preheat
1105 ld1 {v7.16b}, [$inp],#16 // load input
1106 eor v7.16b, v7.16b, v0.16b // xor with ivec
1107 bl _vpaes_encrypt_core
1108 st1 {v0.16b}, [$out],#16 // save output
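// CBC encryption is inherently serial: each block's input is the
// previous ciphertext, so there is no 2x variant here, unlike the
// CBC decrypt path below.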
1112 st1 {v0.16b}, [$ivec] // write ivec
1114 ldp x29,x30,[sp],#16
1117 .size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
1119 .type vpaes_cbc_decrypt,%function
1122 stp x29,x30,[sp,#-16]!
1124 stp d8,d9,[sp,#-16]! // ABI spec says so
1125 stp d10,d11,[sp,#-16]!
1126 stp d12,d13,[sp,#-16]!
1127 stp d14,d15,[sp,#-16]!
1129 mov x17, $len // reassign
1130 mov x2, $key // reassign
1131 ld1 {v6.16b}, [$ivec] // load ivec
1132 bl _vpaes_decrypt_preheat
1134 b.eq .Lcbc_dec_loop2x
1136 ld1 {v7.16b}, [$inp], #16 // load input
1137 bl _vpaes_decrypt_core
1138 eor v0.16b, v0.16b, v6.16b // xor with ivec
1139 orr v6.16b, v7.16b, v7.16b // next ivec value
1140 st1 {v0.16b}, [$out], #16
1146 ld1 {v14.16b,v15.16b}, [$inp], #32
1147 bl _vpaes_decrypt_2x
1148 eor v0.16b, v0.16b, v6.16b // xor with ivec
1149 eor v1.16b, v1.16b, v14.16b
1150 orr v6.16b, v15.16b, v15.16b
1151 st1 {v0.16b,v1.16b}, [$out], #32
1153 b.hi .Lcbc_dec_loop2x
1156 st1 {v6.16b}, [$ivec]
1158 ldp d14,d15,[sp],#16
1159 ldp d12,d13,[sp],#16
1160 ldp d10,d11,[sp],#16
1162 ldp x29,x30,[sp],#16
1164 .size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
1168 .globl vpaes_ecb_encrypt
1169 .type vpaes_ecb_encrypt,%function
1172 stp x29,x30,[sp,#-16]!
1174 stp d8,d9,[sp,#-16]! // ABI spec says so
1175 stp d10,d11,[sp,#-16]!
1176 stp d12,d13,[sp,#-16]!
1177 stp d14,d15,[sp,#-16]!
1181 bl _vpaes_encrypt_preheat
1185 ld1 {v7.16b}, [$inp],#16
1186 bl _vpaes_encrypt_core
1187 st1 {v0.16b}, [$out],#16
1193 ld1 {v14.16b,v15.16b}, [$inp], #32
1194 bl _vpaes_encrypt_2x
1195 st1 {v0.16b,v1.16b}, [$out], #32
1200 ldp d14,d15,[sp],#16
1201 ldp d12,d13,[sp],#16
1202 ldp d10,d11,[sp],#16
1204 ldp x29,x30,[sp],#16
1206 .size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
1208 .globl vpaes_ecb_decrypt
1209 .type vpaes_ecb_decrypt,%function
1212 stp x29,x30,[sp,#-16]!
1214 stp d8,d9,[sp,#-16]! // ABI spec says so
1215 stp d10,d11,[sp,#-16]!
1216 stp d12,d13,[sp,#-16]!
1217 stp d14,d15,[sp,#-16]!
1221 bl _vpaes_decrypt_preheat
1225 ld1 {v7.16b}, [$inp],#16
bl _vpaes_decrypt_core
1227 st1 {v0.16b}, [$out],#16
1233 ld1 {v14.16b,v15.16b}, [$inp], #32
1234 bl _vpaes_decrypt_2x
1235 st1 {v0.16b,v1.16b}, [$out], #32
1240 ldp d14,d15,[sp],#16
1241 ldp d12,d13,[sp],#16
1242 ldp d10,d11,[sp],#16
1244 ldp x29,x30,[sp],#16
1246 .size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt