Linux-libre 3.16.85-gnu
[librecmc/linux-libre.git] / arch / arm64 / crypto / aes-modes.S
1 /*
2  * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
3  *
4  * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License version 2 as
8  * published by the Free Software Foundation.
9  */
10
11 /* included by aes-ce.S and aes-neon.S */
12
/* Everything below is emitted into .text, starting on a 2^4 = 16 byte boundary. */
        .text
        .p2align        4
15
16 /*
17  * There are several ways to instantiate this code:
18  * - no interleave, all inline
19  * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
20  * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
21  * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
22  * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
23  *
24  * Macros imported by this code:
25  * - enc_prepare        - setup NEON registers for encryption
26  * - dec_prepare        - setup NEON registers for decryption
27  * - enc_switch_key     - change to new key after having prepared for encryption
28  * - encrypt_block      - encrypt a single block
29  * - decrypt_block      - decrypt a single block
30  * - encrypt_block2x    - encrypt 2 blocks in parallel (if INTERLEAVE == 2)
31  * - decrypt_block2x    - decrypt 2 blocks in parallel (if INTERLEAVE == 2)
32  * - encrypt_block4x    - encrypt 4 blocks in parallel (if INTERLEAVE == 4)
33  * - decrypt_block4x    - decrypt 4 blocks in parallel (if INTERLEAVE == 4)
34  */
35
/*
 * Glue between the chaining-mode routines below and the N-way block macros
 * (encrypt_block2x/4x, decrypt_block2x/4x) supplied by the including file.
 *
 * - INTERLEAVE defined, INTERLEAVE_INLINE not defined:
 *   the N-way macros are expanded once, inside local subroutines, and
 *   do_{en,de}crypt_blockNx turn into 'bl' calls to them.  Since 'bl'
 *   overwrites x30, FRAME_PUSH/FRAME_POP save and restore x29/x30.
 * - otherwise:
 *   do_{en,de}crypt_blockNx expand the N-way macros in place and
 *   FRAME_PUSH/FRAME_POP expand to nothing.
 *
 * In both cases the data registers are v0-v1 (2x) or v0-v3 (4x) and the
 * remaining macro arguments are always w3, x2, x6, w7.
 *
 * INTERLEAVE is validated in both branches: the loops below count blocks in
 * steps of INTERLEAVE but only have 2-way and 4-way bodies, so any other
 * value >= 2 would make the count and the amount of data processed disagree.
 */
#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
#define FRAME_PUSH      stp x29, x30, [sp,#-16]! ; mov x29, sp
#define FRAME_POP       ldp x29, x30, [sp],#16

#if INTERLEAVE == 2

aes_encrypt_block2x:
        encrypt_block2x v0, v1, w3, x2, x6, w7
        ret
ENDPROC(aes_encrypt_block2x)

aes_decrypt_block2x:
        decrypt_block2x v0, v1, w3, x2, x6, w7
        ret
ENDPROC(aes_decrypt_block2x)

#elif INTERLEAVE == 4

aes_encrypt_block4x:
        encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
        ret
ENDPROC(aes_encrypt_block4x)

aes_decrypt_block4x:
        decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
        ret
ENDPROC(aes_decrypt_block4x)

#else
#error INTERLEAVE should equal 2 or 4
#endif

        .macro          do_encrypt_block2x
        bl              aes_encrypt_block2x
        .endm

        .macro          do_decrypt_block2x
        bl              aes_decrypt_block2x
        .endm

        .macro          do_encrypt_block4x
        bl              aes_encrypt_block4x
        .endm

        .macro          do_decrypt_block4x
        bl              aes_decrypt_block4x
        .endm

#else

/*
 * Inline (or non-interleaved) build.  Values below 2 simply disable the
 * interleaved loops; anything above 2 other than 4 has no matching body.
 */
#if defined(INTERLEAVE) && INTERLEAVE > 2 && INTERLEAVE != 4
#error INTERLEAVE should equal 2 or 4
#endif

#define FRAME_PUSH
#define FRAME_POP

        .macro          do_encrypt_block2x
        encrypt_block2x v0, v1, w3, x2, x6, w7
        .endm

        .macro          do_decrypt_block2x
        decrypt_block2x v0, v1, w3, x2, x6, w7
        .endm

        .macro          do_encrypt_block4x
        encrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
        .endm

        .macro          do_decrypt_block4x
        decrypt_block4x v0, v1, v2, v3, w3, x2, x6, w7
        .endm

#endif
105
106         /*
107          * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
108          *                 int blocks, int first)
109          * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
110          *                 int blocks, int first)
111          */
112
/*
 * aes_ecb_encrypt - encrypt 'blocks' consecutive 16-byte blocks, each one
 * independently of the others.
 *
 * Registers on entry, in C prototype order:
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, w5 = first
 *
 * When 'first' is non-zero, enc_prepare is run on (rounds, rk); when it is
 * zero that setup is skipped.  x0 and x1 are post-incremented past the data
 * that was processed and w4 is counted down to zero.  Once 'first' has been
 * tested, x5/w6 (and x6/w7 for the N-way macros) are handed to the cipher
 * macros as their trailing arguments.
 *
 * blocks == 0 returns without touching memory in every build variant.
 */
AES_ENTRY(aes_ecb_encrypt)
        FRAME_PUSH
        cbz             w5, .LecbencloopNx              /* first == 0: skip setup */

        enc_prepare     w3, x2, x5

.LecbencloopNx:
#if INTERLEAVE >= 2
        subs            w4, w4, #INTERLEAVE
        bmi             .Lecbenc1x                      /* < INTERLEAVE blocks left */
#if INTERLEAVE == 2
        ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 pt blocks */
        do_encrypt_block2x
        st1             {v0.16b-v1.16b}, [x0], #32
#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        do_encrypt_block4x
        st1             {v0.16b-v3.16b}, [x0], #64
#endif
        b               .LecbencloopNx
.Lecbenc1x:
        adds            w4, w4, #INTERLEAVE             /* undo the last subtraction */
        beq             .Lecbencout                     /* no leftover blocks */
#else
        /*
         * The loop below tests its counter only after processing a block,
         * so an empty request has to be filtered out up front.
         */
        cbz             w4, .Lecbencout
#endif
.Lecbencloop:
        ld1             {v0.16b}, [x1], #16             /* get next pt block */
        encrypt_block   v0, w3, x2, x5, w6
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lecbencloop
.Lecbencout:
        FRAME_POP
        ret
AES_ENDPROC(aes_ecb_encrypt)
147
148
/*
 * aes_ecb_decrypt - decrypt 'blocks' consecutive 16-byte blocks, each one
 * independently of the others.
 *
 * Registers on entry, in C prototype order:
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, w5 = first
 *
 * When 'first' is non-zero, dec_prepare is run on (rounds, rk); when it is
 * zero that setup is skipped.  x0 and x1 are post-incremented past the data
 * that was processed and w4 is counted down to zero.  Once 'first' has been
 * tested, x5/w6 (and x6/w7 for the N-way macros) are handed to the cipher
 * macros as their trailing arguments.
 *
 * blocks == 0 returns without touching memory in every build variant.
 */
AES_ENTRY(aes_ecb_decrypt)
        FRAME_PUSH
        cbz             w5, .LecbdecloopNx              /* first == 0: skip setup */

        dec_prepare     w3, x2, x5

.LecbdecloopNx:
#if INTERLEAVE >= 2
        subs            w4, w4, #INTERLEAVE
        bmi             .Lecbdec1x                      /* < INTERLEAVE blocks left */
#if INTERLEAVE == 2
        ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
        do_decrypt_block2x
        st1             {v0.16b-v1.16b}, [x0], #32
#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        do_decrypt_block4x
        st1             {v0.16b-v3.16b}, [x0], #64
#endif
        b               .LecbdecloopNx
.Lecbdec1x:
        adds            w4, w4, #INTERLEAVE             /* undo the last subtraction */
        beq             .Lecbdecout                     /* no leftover blocks */
#else
        /*
         * The loop below tests its counter only after processing a block,
         * so an empty request has to be filtered out up front.
         */
        cbz             w4, .Lecbdecout
#endif
.Lecbdecloop:
        ld1             {v0.16b}, [x1], #16             /* get next ct block */
        decrypt_block   v0, w3, x2, x5, w6
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lecbdecloop
.Lecbdecout:
        FRAME_POP
        ret
AES_ENDPROC(aes_ecb_decrypt)
183
184
185         /*
186          * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
187          *                 int blocks, u8 iv[], int first)
188          * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
189          *                 int blocks, u8 iv[], int first)
190          */
191
/*
 * aes_cbc_encrypt - CBC encryption: every plaintext block is XORed with the
 * previous ciphertext block (the IV for the very first one) before it is
 * encrypted.
 *
 * Registers on entry, in C prototype order:
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = iv,
 *   w6 = first
 *
 * v0 carries the chaining value.  It is loaded from iv[] only when 'first'
 * is non-zero; with first == 0 whatever v0 already contains is used.  The
 * final chaining value is written back to iv[] before returning.
 *
 * There is no interleaved variant: each block depends on the previous
 * result.  blocks == 0 skips the loop and only rewrites iv[] with the
 * current chaining value.
 */
AES_ENTRY(aes_cbc_encrypt)
        cbz             w6, .Lcbcencnotfirst

        ld1             {v0.16b}, [x5]                  /* get iv */
        enc_prepare     w3, x2, x6

.Lcbcencnotfirst:
        /*
         * The loop tests its counter only after processing a block, so an
         * empty request has to be filtered out up front.
         */
        cbz             w4, .Lcbcencout
.Lcbcencloop:
        ld1             {v1.16b}, [x1], #16             /* get next pt block */
        eor             v0.16b, v0.16b, v1.16b          /* ..and xor with iv */
        encrypt_block   v0, w3, x2, x6, w7
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lcbcencloop
.Lcbcencout:
        st1             {v0.16b}, [x5]                  /* return iv */
        ret
AES_ENDPROC(aes_cbc_encrypt)
208
209
/*
 * aes_cbc_decrypt - CBC decryption: every decrypted block is XORed with the
 * previous ciphertext block (the IV for the very first one).
 *
 * Registers on entry, in C prototype order:
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = iv,
 *   w6 = first
 *
 * v7 carries the chaining value.  It is loaded from iv[] only when 'first'
 * is non-zero; with first == 0 whatever v7 already contains is used.  The
 * last ciphertext block processed is written back to iv[] on return.
 *
 * Ciphertext is copied aside (or re-read from 'in') before the plaintext is
 * stored, so each block's chaining input is taken before the matching output
 * store.
 *
 * blocks == 0 skips all processing in every build variant and only rewrites
 * iv[] with the current chaining value.
 */
AES_ENTRY(aes_cbc_decrypt)
        FRAME_PUSH
        cbz             w6, .LcbcdecloopNx              /* first == 0: skip setup */

        ld1             {v7.16b}, [x5]                  /* get iv */
        dec_prepare     w3, x2, x6

.LcbcdecloopNx:
#if INTERLEAVE >= 2
        subs            w4, w4, #INTERLEAVE
        bmi             .Lcbcdec1x                      /* < INTERLEAVE blocks left */
#if INTERLEAVE == 2
        ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
        mov             v2.16b, v0.16b                  /* keep ct: chains into block 1 */
        mov             v3.16b, v1.16b                  /* keep ct: becomes next iv */
        do_decrypt_block2x
        eor             v0.16b, v0.16b, v7.16b
        eor             v1.16b, v1.16b, v2.16b
        mov             v7.16b, v3.16b
        st1             {v0.16b-v1.16b}, [x0], #32
#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        mov             v4.16b, v0.16b                  /* keep ct 0..2 for chaining */
        mov             v5.16b, v1.16b
        mov             v6.16b, v2.16b
        do_decrypt_block4x
        sub             x1, x1, #16                     /* step back to ct block 3 */
        eor             v0.16b, v0.16b, v7.16b
        eor             v1.16b, v1.16b, v4.16b
        ld1             {v7.16b}, [x1], #16             /* reload 1 ct block */
        eor             v2.16b, v2.16b, v5.16b
        eor             v3.16b, v3.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
#endif
        b               .LcbcdecloopNx
.Lcbcdec1x:
        adds            w4, w4, #INTERLEAVE             /* undo the last subtraction */
        beq             .Lcbcdecout                     /* no leftover blocks */
#else
        /*
         * The loop below tests its counter only after processing a block,
         * so an empty request has to be filtered out up front.
         */
        cbz             w4, .Lcbcdecout
#endif
.Lcbcdecloop:
        ld1             {v1.16b}, [x1], #16             /* get next ct block */
        mov             v0.16b, v1.16b                  /* ...and copy to v0 */
        decrypt_block   v0, w3, x2, x6, w7
        eor             v0.16b, v0.16b, v7.16b          /* xor with iv => pt */
        mov             v7.16b, v1.16b                  /* ct is next iv */
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        bne             .Lcbcdecloop
.Lcbcdecout:
        FRAME_POP
        st1             {v7.16b}, [x5]                  /* return iv */
        ret
AES_ENDPROC(aes_cbc_decrypt)
263
264
265         /*
266          * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
267          *                 int blocks, u8 ctr[], int first)
268          */
269
/*
 * aes_ctr_encrypt - CTR mode: XOR the input with the encrypted counter block.
 *
 * Registers on entry, in C prototype order:
 *   x0 = out, x1 = in, x2 = rk, w3 = rounds, w4 = blocks, x5 = ctr,
 *   w6 = first
 *
 * v4 holds the current counter block.  x8 shadows its second 64-bit lane in
 * byte-reversed form so that it can be stepped with integer arithmetic; a
 * carry out of x8 is propagated into the first lane by .Lctr_carry.
 *
 * The interleaved loop only rewrites the counter's second lane (its last
 * 32 bits in the 4-way case), so it is used only when adding 'blocks' to the
 * low 32 bits of x8 produces no carry; otherwise every block goes through
 * the one-at-a-time loop.
 *
 * A negative 'blocks' ends in .Lctr_partial: one keystream block is
 * generated but only 8 bytes are read from 'in' and written to 'out', and
 * the counter is not stored back.
 */
AES_ENTRY(aes_ctr_encrypt)
        FRAME_PUSH
        cbz             w6, .Lctr_resume                /* first == 0: skip setup */
        enc_prepare     w3, x2, x6
        ld1             {v4.16b}, [x5]                  /* fetch counter block */

.Lctr_resume:
        umov            x8, v4.d[1]
        rev             x8, x8                          /* x8 = byte-swapped lane 1 */
#if INTERLEAVE >= 2
        cmn             w8, w4                          /* carry out of low 32 bits? */
        bcs             .Lctr_single                    /* yes: go one block at a time */
.Lctr_wide:
        subs            w4, w4, #INTERLEAVE
        bmi             .Lctr_leftover                  /* < INTERLEAVE blocks left */
#if INTERLEAVE == 2
        mov             v0.8b, v4.8b                    /* lane 0 of both counters */
        mov             v1.8b, v4.8b
        rev             x7, x8
        add             x8, x8, #1
        ins             v0.d[1], x7                     /* v0 = counter n */
        rev             x7, x8
        add             x8, x8, #1
        ins             v1.d[1], x7                     /* v1 = counter n + 1 */
        ld1             {v2.16b-v3.16b}, [x1], #32      /* 2 blocks of input */
        do_encrypt_block2x
        eor             v0.16b, v2.16b, v0.16b
        eor             v1.16b, v3.16b, v1.16b
        st1             {v0.16b-v1.16b}, [x0], #32
#else
        ldr             q8, =0x30000000200000001        /* 32-bit lanes: 1, 2, 3, 0 */
        dup             v7.4s, w8
        mov             v0.16b, v4.16b                  /* v0 = counter n */
        add             v7.4s, v7.4s, v8.4s             /* low words of n+1, n+2, n+3 */
        mov             v1.16b, v4.16b
        rev32           v8.16b, v7.16b                  /* back to in-block byte order */
        mov             v2.16b, v4.16b
        mov             v3.16b, v4.16b
        mov             v1.s[3], v8.s[0]                /* v1 = counter n + 1 */
        mov             v2.s[3], v8.s[1]                /* v2 = counter n + 2 */
        mov             v3.s[3], v8.s[2]                /* v3 = counter n + 3 */
        ld1             {v5.16b-v7.16b}, [x1], #48      /* first 3 blocks of input */
        do_encrypt_block4x
        eor             v0.16b, v0.16b, v5.16b
        ld1             {v5.16b}, [x1], #16             /* 4th block of input */
        eor             v1.16b, v1.16b, v6.16b
        eor             v2.16b, v2.16b, v7.16b
        eor             v3.16b, v3.16b, v5.16b
        st1             {v0.16b-v3.16b}, [x0], #64
        add             x8, x8, #INTERLEAVE
#endif
        rev             x7, x8
        ins             v4.d[1], x7                     /* v4 = next counter block */
        cbnz            w4, .Lctr_wide
        b               .Lctr_done
.Lctr_leftover:
        adds            w4, w4, #INTERLEAVE             /* undo the last subtraction */
        beq             .Lctr_done
#endif
.Lctr_single:
        mov             v0.16b, v4.16b
        encrypt_block   v0, w3, x2, x6, w7              /* v0 = keystream block */

        adds            x8, x8, #1                      /* step the counter */
        rev             x7, x8
        ins             v4.d[1], x7
        bcs             .Lctr_carry                     /* flags are from the adds */

.Lctr_carried:
        subs            w4, w4, #1
        bmi             .Lctr_partial                   /* blocks was negative */
        ld1             {v3.16b}, [x1], #16
        eor             v3.16b, v0.16b, v3.16b
        st1             {v3.16b}, [x0], #16
        bne             .Lctr_single                    /* flags are from the subs */

.Lctr_done:
        st1             {v4.16b}, [x5]                  /* hand back next counter */
        FRAME_POP
        ret

.Lctr_partial:
        ld1             {v3.8b}, [x1]                   /* 8 bytes only */
        eor             v3.8b, v0.8b, v3.8b
        st1             {v3.8b}, [x0]
        FRAME_POP
        ret

.Lctr_carry:
        umov            x7, v4.d[0]                     /* first lane of the counter */
        rev             x7, x7                          /* swap, add the carry, swap back */
        add             x7, x7, #1
        rev             x7, x7
        ins             v4.d[0], x7
        b               .Lctr_carried
AES_ENDPROC(aes_ctr_encrypt)
        .ltorg
367
368
369         /*
370          * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
371          *                 int blocks, u8 const rk2[], u8 iv[], int first)
372          * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
373          *                 int blocks, u8 const rk2[], u8 iv[], int first)
374          */
375
/*
 * next_tweak - compute the tweak that follows \in and place it in \out.
 *
 * Both 64-bit lanes of \in are doubled.  Separately, the top bit of each
 * lane is smeared across that lane, masked with \const, and the two masked
 * lanes are swapped before being XORed into the doubled value.  With
 * \const = .Lxts_mul_x this moves the bit shifted out of lane 0 into bit 0
 * of lane 1 and turns the bit shifted out of lane 1 into 0x87 in lane 0.
 *
 * \tmp is clobbered and must be distinct from the other operands.
 * \out may be the same register as \in or as \const: both are read for the
 * last time before \out is first written.
 */
        .macro          next_tweak, out, in, const, tmp
        sshr            \tmp\().2d,  \in\().2d,   #63
        and             \tmp\().16b, \tmp\().16b, \const\().16b
        ext             \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
        add             \out\().2d,  \in\().2d,   \in\().2d
        eor             \out\().16b, \out\().16b, \tmp\().16b
        .endm

/*
 * Mask operand for next_tweak, fetched with 'ldr q7, .Lxts_mul_x'.  The two
 * quads are emitted in opposite order for little- and big-endian builds,
 * presumably so that the loaded lanes are the same either way.
 */
.Lxts_mul_x:
CPU_LE( .quad           1, 0x87         )
CPU_BE( .quad           0x87, 1         )
387
/*
 * aes_xts_encrypt - XTS encryption: each block is XORed with its tweak,
 * encrypted with rk1, and XORed with the same tweak again.
 *
 * Registers on entry, in C prototype order:
 *   x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = blocks, x5 = rk2,
 *   x6 = iv, w7 = first
 *
 * v4 = current tweak, v5-v7 = further tweaks in the interleaved loops,
 * v7 = .Lxts_mul_x otherwise, v8 = scratch for next_tweak.
 *
 * When 'first' is non-zero the initial tweak is produced by encrypting iv[]
 * with rk2, after which the key is switched to rk1.  When it is zero, the
 * tweak already in v4 is advanced once and processing continues from there.
 * On return v4 holds the tweak of the last block processed; nothing is
 * written back to iv[].
 *
 * blocks == 0 performs the tweak setup/advance described above and then
 * returns without touching 'in' or 'out' in every build variant.
 */
AES_ENTRY(aes_xts_encrypt)
        FRAME_PUSH
        cbz             w7, .LxtsencloopNx              /* first == 0: continue */

        ld1             {v4.16b}, [x6]
        enc_prepare     w3, x5, x6
        encrypt_block   v4, w3, x5, x6, w7              /* first tweak */
        enc_switch_key  w3, x2, x6
        ldr             q7, .Lxts_mul_x
        b               .LxtsencNx

.LxtsencloopNx:
        ldr             q7, .Lxts_mul_x                 /* (re)load: 4x loop reuses v7 */
        next_tweak      v4, v4, v7, v8
.LxtsencNx:
#if INTERLEAVE >= 2
        subs            w4, w4, #INTERLEAVE
        bmi             .Lxtsenc1x                      /* < INTERLEAVE blocks left */
#if INTERLEAVE == 2
        ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 pt blocks */
        next_tweak      v5, v4, v7, v8
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        do_encrypt_block2x
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        st1             {v0.16b-v1.16b}, [x0], #32
        cbz             w4, .LxtsencoutNx
        next_tweak      v4, v5, v7, v8
        b               .LxtsencNx
.LxtsencoutNx:
        mov             v4.16b, v5.16b                  /* v4 = last tweak used */
        b               .Lxtsencout
#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 pt blocks */
        next_tweak      v5, v4, v7, v8
        eor             v0.16b, v0.16b, v4.16b
        next_tweak      v6, v5, v7, v8
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v7, v8                  /* overwrites the mask in v7 */
        eor             v3.16b, v3.16b, v7.16b
        do_encrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v7.16b                  /* v4 = last tweak used */
        cbz             w4, .Lxtsencout
        b               .LxtsencloopNx
#endif
.Lxtsenc1x:
        adds            w4, w4, #INTERLEAVE             /* undo the last subtraction */
        beq             .Lxtsencout                     /* no leftover blocks */
#else
        /*
         * The loop below tests its counter only after processing a block,
         * so an empty request has to be filtered out up front.
         */
        cbz             w4, .Lxtsencout
#endif
.Lxtsencloop:
        ld1             {v1.16b}, [x1], #16
        eor             v0.16b, v1.16b, v4.16b
        encrypt_block   v0, w3, x2, x6, w7
        eor             v0.16b, v0.16b, v4.16b
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        beq             .Lxtsencout
        next_tweak      v4, v4, v7, v8
        b               .Lxtsencloop
.Lxtsencout:
        FRAME_POP
        ret
AES_ENDPROC(aes_xts_encrypt)
458
459
/*
 * aes_xts_decrypt - XTS decryption: each block is XORed with its tweak,
 * decrypted with rk1, and XORed with the same tweak again.
 *
 * Registers on entry, in C prototype order:
 *   x0 = out, x1 = in, x2 = rk1, w3 = rounds, w4 = blocks, x5 = rk2,
 *   x6 = iv, w7 = first
 *
 * v4 = current tweak, v5-v7 = further tweaks in the interleaved loops,
 * v7 = .Lxts_mul_x otherwise, v8 = scratch for next_tweak.
 *
 * When 'first' is non-zero the initial tweak is produced by *encrypting*
 * iv[] with rk2 (enc_prepare/encrypt_block), after which dec_prepare sets up
 * rk1 for decryption.  When it is zero, the tweak already in v4 is advanced
 * once and processing continues from there.  On return v4 holds the tweak of
 * the last block processed; nothing is written back to iv[].
 *
 * blocks == 0 performs the tweak setup/advance described above and then
 * returns without touching 'in' or 'out' in every build variant.
 */
AES_ENTRY(aes_xts_decrypt)
        FRAME_PUSH
        cbz             w7, .LxtsdecloopNx              /* first == 0: continue */

        ld1             {v4.16b}, [x6]
        enc_prepare     w3, x5, x6
        encrypt_block   v4, w3, x5, x6, w7              /* first tweak */
        dec_prepare     w3, x2, x6
        ldr             q7, .Lxts_mul_x
        b               .LxtsdecNx

.LxtsdecloopNx:
        ldr             q7, .Lxts_mul_x                 /* (re)load: 4x loop reuses v7 */
        next_tweak      v4, v4, v7, v8
.LxtsdecNx:
#if INTERLEAVE >= 2
        subs            w4, w4, #INTERLEAVE
        bmi             .Lxtsdec1x                      /* < INTERLEAVE blocks left */
#if INTERLEAVE == 2
        ld1             {v0.16b-v1.16b}, [x1], #32      /* get 2 ct blocks */
        next_tweak      v5, v4, v7, v8
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        do_decrypt_block2x
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        st1             {v0.16b-v1.16b}, [x0], #32
        cbz             w4, .LxtsdecoutNx
        next_tweak      v4, v5, v7, v8
        b               .LxtsdecNx
.LxtsdecoutNx:
        mov             v4.16b, v5.16b                  /* v4 = last tweak used */
        b               .Lxtsdecout
#else
        ld1             {v0.16b-v3.16b}, [x1], #64      /* get 4 ct blocks */
        next_tweak      v5, v4, v7, v8
        eor             v0.16b, v0.16b, v4.16b
        next_tweak      v6, v5, v7, v8
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        next_tweak      v7, v6, v7, v8                  /* overwrites the mask in v7 */
        eor             v3.16b, v3.16b, v7.16b
        do_decrypt_block4x
        eor             v3.16b, v3.16b, v7.16b
        eor             v0.16b, v0.16b, v4.16b
        eor             v1.16b, v1.16b, v5.16b
        eor             v2.16b, v2.16b, v6.16b
        st1             {v0.16b-v3.16b}, [x0], #64
        mov             v4.16b, v7.16b                  /* v4 = last tweak used */
        cbz             w4, .Lxtsdecout
        b               .LxtsdecloopNx
#endif
.Lxtsdec1x:
        adds            w4, w4, #INTERLEAVE             /* undo the last subtraction */
        beq             .Lxtsdecout                     /* no leftover blocks */
#else
        /*
         * The loop below tests its counter only after processing a block,
         * so an empty request has to be filtered out up front.
         */
        cbz             w4, .Lxtsdecout
#endif
.Lxtsdecloop:
        ld1             {v1.16b}, [x1], #16
        eor             v0.16b, v1.16b, v4.16b
        decrypt_block   v0, w3, x2, x6, w7
        eor             v0.16b, v0.16b, v4.16b
        st1             {v0.16b}, [x0], #16
        subs            w4, w4, #1
        beq             .Lxtsdecout
        next_tweak      v4, v4, v7, v8
        b               .Lxtsdecloop
.Lxtsdecout:
        FRAME_POP
        ret
AES_ENDPROC(aes_xts_decrypt)