# Linux-libre 5.4.47-gnu
# Source: librecmc/linux-libre.git — arch/x86/crypto/chacha-avx512vl-x86_64.S
1 /* SPDX-License-Identifier: GPL-2.0+ */
2 /*
3  * ChaCha 256-bit cipher algorithm, x64 AVX-512VL functions
4  *
5  * Copyright (C) 2018 Martin Willi
6  */
7
8 #include <linux/linkage.h>
9
10 .section        .rodata.cst32.CTR2BL, "aM", @progbits, 32
11 .align 32
12 CTR2BL: .octa 0x00000000000000000000000000000000
13         .octa 0x00000000000000000000000000000001
14
15 .section        .rodata.cst32.CTR4BL, "aM", @progbits, 32
16 .align 32
17 CTR4BL: .octa 0x00000000000000000000000000000002
18         .octa 0x00000000000000000000000000000003
19
20 .section        .rodata.cst32.CTR8BL, "aM", @progbits, 32
21 .align 32
22 CTR8BL: .octa 0x00000003000000020000000100000000
23         .octa 0x00000007000000060000000500000004
24
25 .text
26
27 ENTRY(chacha_2block_xor_avx512vl)
28         # %rdi: Input state matrix, s
29         # %rsi: up to 2 data blocks output, o
30         # %rdx: up to 2 data blocks input, i
31         # %rcx: input/output length in bytes
32         # %r8d: nrounds
33
34         # This function encrypts two ChaCha blocks by loading the state
35         # matrix twice across four AVX registers. It performs matrix operations
36         # on four words in each matrix in parallel, but requires shuffling to
37         # rearrange the words after each round.
38
39         vzeroupper
40
41         # x0..3[0-2] = s0..3
42         vbroadcasti128  0x00(%rdi),%ymm0
43         vbroadcasti128  0x10(%rdi),%ymm1
44         vbroadcasti128  0x20(%rdi),%ymm2
45         vbroadcasti128  0x30(%rdi),%ymm3
46
47         vpaddd          CTR2BL(%rip),%ymm3,%ymm3
48
49         vmovdqa         %ymm0,%ymm8
50         vmovdqa         %ymm1,%ymm9
51         vmovdqa         %ymm2,%ymm10
52         vmovdqa         %ymm3,%ymm11
53
54 .Ldoubleround:
55
56         # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
57         vpaddd          %ymm1,%ymm0,%ymm0
58         vpxord          %ymm0,%ymm3,%ymm3
59         vprold          $16,%ymm3,%ymm3
60
61         # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
62         vpaddd          %ymm3,%ymm2,%ymm2
63         vpxord          %ymm2,%ymm1,%ymm1
64         vprold          $12,%ymm1,%ymm1
65
66         # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
67         vpaddd          %ymm1,%ymm0,%ymm0
68         vpxord          %ymm0,%ymm3,%ymm3
69         vprold          $8,%ymm3,%ymm3
70
71         # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
72         vpaddd          %ymm3,%ymm2,%ymm2
73         vpxord          %ymm2,%ymm1,%ymm1
74         vprold          $7,%ymm1,%ymm1
75
76         # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
77         vpshufd         $0x39,%ymm1,%ymm1
78         # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
79         vpshufd         $0x4e,%ymm2,%ymm2
80         # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
81         vpshufd         $0x93,%ymm3,%ymm3
82
83         # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
84         vpaddd          %ymm1,%ymm0,%ymm0
85         vpxord          %ymm0,%ymm3,%ymm3
86         vprold          $16,%ymm3,%ymm3
87
88         # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
89         vpaddd          %ymm3,%ymm2,%ymm2
90         vpxord          %ymm2,%ymm1,%ymm1
91         vprold          $12,%ymm1,%ymm1
92
93         # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
94         vpaddd          %ymm1,%ymm0,%ymm0
95         vpxord          %ymm0,%ymm3,%ymm3
96         vprold          $8,%ymm3,%ymm3
97
98         # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
99         vpaddd          %ymm3,%ymm2,%ymm2
100         vpxord          %ymm2,%ymm1,%ymm1
101         vprold          $7,%ymm1,%ymm1
102
103         # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
104         vpshufd         $0x93,%ymm1,%ymm1
105         # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
106         vpshufd         $0x4e,%ymm2,%ymm2
107         # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
108         vpshufd         $0x39,%ymm3,%ymm3
109
110         sub             $2,%r8d
111         jnz             .Ldoubleround
112
113         # o0 = i0 ^ (x0 + s0)
114         vpaddd          %ymm8,%ymm0,%ymm7
115         cmp             $0x10,%rcx
116         jl              .Lxorpart2
117         vpxord          0x00(%rdx),%xmm7,%xmm6
118         vmovdqu         %xmm6,0x00(%rsi)
119         vextracti128    $1,%ymm7,%xmm0
120         # o1 = i1 ^ (x1 + s1)
121         vpaddd          %ymm9,%ymm1,%ymm7
122         cmp             $0x20,%rcx
123         jl              .Lxorpart2
124         vpxord          0x10(%rdx),%xmm7,%xmm6
125         vmovdqu         %xmm6,0x10(%rsi)
126         vextracti128    $1,%ymm7,%xmm1
127         # o2 = i2 ^ (x2 + s2)
128         vpaddd          %ymm10,%ymm2,%ymm7
129         cmp             $0x30,%rcx
130         jl              .Lxorpart2
131         vpxord          0x20(%rdx),%xmm7,%xmm6
132         vmovdqu         %xmm6,0x20(%rsi)
133         vextracti128    $1,%ymm7,%xmm2
134         # o3 = i3 ^ (x3 + s3)
135         vpaddd          %ymm11,%ymm3,%ymm7
136         cmp             $0x40,%rcx
137         jl              .Lxorpart2
138         vpxord          0x30(%rdx),%xmm7,%xmm6
139         vmovdqu         %xmm6,0x30(%rsi)
140         vextracti128    $1,%ymm7,%xmm3
141
142         # xor and write second block
143         vmovdqa         %xmm0,%xmm7
144         cmp             $0x50,%rcx
145         jl              .Lxorpart2
146         vpxord          0x40(%rdx),%xmm7,%xmm6
147         vmovdqu         %xmm6,0x40(%rsi)
148
149         vmovdqa         %xmm1,%xmm7
150         cmp             $0x60,%rcx
151         jl              .Lxorpart2
152         vpxord          0x50(%rdx),%xmm7,%xmm6
153         vmovdqu         %xmm6,0x50(%rsi)
154
155         vmovdqa         %xmm2,%xmm7
156         cmp             $0x70,%rcx
157         jl              .Lxorpart2
158         vpxord          0x60(%rdx),%xmm7,%xmm6
159         vmovdqu         %xmm6,0x60(%rsi)
160
161         vmovdqa         %xmm3,%xmm7
162         cmp             $0x80,%rcx
163         jl              .Lxorpart2
164         vpxord          0x70(%rdx),%xmm7,%xmm6
165         vmovdqu         %xmm6,0x70(%rsi)
166
167 .Ldone2:
168         vzeroupper
169         ret
170
171 .Lxorpart2:
172         # xor remaining bytes from partial register into output
173         mov             %rcx,%rax
174         and             $0xf,%rcx
175         jz              .Ldone8
176         mov             %rax,%r9
177         and             $~0xf,%r9
178
179         mov             $1,%rax
180         shld            %cl,%rax,%rax
181         sub             $1,%rax
182         kmovq           %rax,%k1
183
184         vmovdqu8        (%rdx,%r9),%xmm1{%k1}{z}
185         vpxord          %xmm7,%xmm1,%xmm1
186         vmovdqu8        %xmm1,(%rsi,%r9){%k1}
187
188         jmp             .Ldone2
189
190 ENDPROC(chacha_2block_xor_avx512vl)
191
192 ENTRY(chacha_4block_xor_avx512vl)
193         # %rdi: Input state matrix, s
194         # %rsi: up to 4 data blocks output, o
195         # %rdx: up to 4 data blocks input, i
196         # %rcx: input/output length in bytes
197         # %r8d: nrounds
198
199         # This function encrypts four ChaCha blocks by loading the state
200         # matrix four times across eight AVX registers. It performs matrix
201         # operations on four words in two matrices in parallel, sequentially
202         # to the operations on the four words of the other two matrices. The
203         # required word shuffling has a rather high latency, we can do the
204         # arithmetic on two matrix-pairs without much slowdown.
205
206         vzeroupper
207
208         # x0..3[0-4] = s0..3
209         vbroadcasti128  0x00(%rdi),%ymm0
210         vbroadcasti128  0x10(%rdi),%ymm1
211         vbroadcasti128  0x20(%rdi),%ymm2
212         vbroadcasti128  0x30(%rdi),%ymm3
213
214         vmovdqa         %ymm0,%ymm4
215         vmovdqa         %ymm1,%ymm5
216         vmovdqa         %ymm2,%ymm6
217         vmovdqa         %ymm3,%ymm7
218
219         vpaddd          CTR2BL(%rip),%ymm3,%ymm3
220         vpaddd          CTR4BL(%rip),%ymm7,%ymm7
221
222         vmovdqa         %ymm0,%ymm11
223         vmovdqa         %ymm1,%ymm12
224         vmovdqa         %ymm2,%ymm13
225         vmovdqa         %ymm3,%ymm14
226         vmovdqa         %ymm7,%ymm15
227
228 .Ldoubleround4:
229
230         # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
231         vpaddd          %ymm1,%ymm0,%ymm0
232         vpxord          %ymm0,%ymm3,%ymm3
233         vprold          $16,%ymm3,%ymm3
234
235         vpaddd          %ymm5,%ymm4,%ymm4
236         vpxord          %ymm4,%ymm7,%ymm7
237         vprold          $16,%ymm7,%ymm7
238
239         # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
240         vpaddd          %ymm3,%ymm2,%ymm2
241         vpxord          %ymm2,%ymm1,%ymm1
242         vprold          $12,%ymm1,%ymm1
243
244         vpaddd          %ymm7,%ymm6,%ymm6
245         vpxord          %ymm6,%ymm5,%ymm5
246         vprold          $12,%ymm5,%ymm5
247
248         # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
249         vpaddd          %ymm1,%ymm0,%ymm0
250         vpxord          %ymm0,%ymm3,%ymm3
251         vprold          $8,%ymm3,%ymm3
252
253         vpaddd          %ymm5,%ymm4,%ymm4
254         vpxord          %ymm4,%ymm7,%ymm7
255         vprold          $8,%ymm7,%ymm7
256
257         # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
258         vpaddd          %ymm3,%ymm2,%ymm2
259         vpxord          %ymm2,%ymm1,%ymm1
260         vprold          $7,%ymm1,%ymm1
261
262         vpaddd          %ymm7,%ymm6,%ymm6
263         vpxord          %ymm6,%ymm5,%ymm5
264         vprold          $7,%ymm5,%ymm5
265
266         # x1 = shuffle32(x1, MASK(0, 3, 2, 1))
267         vpshufd         $0x39,%ymm1,%ymm1
268         vpshufd         $0x39,%ymm5,%ymm5
269         # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
270         vpshufd         $0x4e,%ymm2,%ymm2
271         vpshufd         $0x4e,%ymm6,%ymm6
272         # x3 = shuffle32(x3, MASK(2, 1, 0, 3))
273         vpshufd         $0x93,%ymm3,%ymm3
274         vpshufd         $0x93,%ymm7,%ymm7
275
276         # x0 += x1, x3 = rotl32(x3 ^ x0, 16)
277         vpaddd          %ymm1,%ymm0,%ymm0
278         vpxord          %ymm0,%ymm3,%ymm3
279         vprold          $16,%ymm3,%ymm3
280
281         vpaddd          %ymm5,%ymm4,%ymm4
282         vpxord          %ymm4,%ymm7,%ymm7
283         vprold          $16,%ymm7,%ymm7
284
285         # x2 += x3, x1 = rotl32(x1 ^ x2, 12)
286         vpaddd          %ymm3,%ymm2,%ymm2
287         vpxord          %ymm2,%ymm1,%ymm1
288         vprold          $12,%ymm1,%ymm1
289
290         vpaddd          %ymm7,%ymm6,%ymm6
291         vpxord          %ymm6,%ymm5,%ymm5
292         vprold          $12,%ymm5,%ymm5
293
294         # x0 += x1, x3 = rotl32(x3 ^ x0, 8)
295         vpaddd          %ymm1,%ymm0,%ymm0
296         vpxord          %ymm0,%ymm3,%ymm3
297         vprold          $8,%ymm3,%ymm3
298
299         vpaddd          %ymm5,%ymm4,%ymm4
300         vpxord          %ymm4,%ymm7,%ymm7
301         vprold          $8,%ymm7,%ymm7
302
303         # x2 += x3, x1 = rotl32(x1 ^ x2, 7)
304         vpaddd          %ymm3,%ymm2,%ymm2
305         vpxord          %ymm2,%ymm1,%ymm1
306         vprold          $7,%ymm1,%ymm1
307
308         vpaddd          %ymm7,%ymm6,%ymm6
309         vpxord          %ymm6,%ymm5,%ymm5
310         vprold          $7,%ymm5,%ymm5
311
312         # x1 = shuffle32(x1, MASK(2, 1, 0, 3))
313         vpshufd         $0x93,%ymm1,%ymm1
314         vpshufd         $0x93,%ymm5,%ymm5
315         # x2 = shuffle32(x2, MASK(1, 0, 3, 2))
316         vpshufd         $0x4e,%ymm2,%ymm2
317         vpshufd         $0x4e,%ymm6,%ymm6
318         # x3 = shuffle32(x3, MASK(0, 3, 2, 1))
319         vpshufd         $0x39,%ymm3,%ymm3
320         vpshufd         $0x39,%ymm7,%ymm7
321
322         sub             $2,%r8d
323         jnz             .Ldoubleround4
324
325         # o0 = i0 ^ (x0 + s0), first block
326         vpaddd          %ymm11,%ymm0,%ymm10
327         cmp             $0x10,%rcx
328         jl              .Lxorpart4
329         vpxord          0x00(%rdx),%xmm10,%xmm9
330         vmovdqu         %xmm9,0x00(%rsi)
331         vextracti128    $1,%ymm10,%xmm0
332         # o1 = i1 ^ (x1 + s1), first block
333         vpaddd          %ymm12,%ymm1,%ymm10
334         cmp             $0x20,%rcx
335         jl              .Lxorpart4
336         vpxord          0x10(%rdx),%xmm10,%xmm9
337         vmovdqu         %xmm9,0x10(%rsi)
338         vextracti128    $1,%ymm10,%xmm1
339         # o2 = i2 ^ (x2 + s2), first block
340         vpaddd          %ymm13,%ymm2,%ymm10
341         cmp             $0x30,%rcx
342         jl              .Lxorpart4
343         vpxord          0x20(%rdx),%xmm10,%xmm9
344         vmovdqu         %xmm9,0x20(%rsi)
345         vextracti128    $1,%ymm10,%xmm2
346         # o3 = i3 ^ (x3 + s3), first block
347         vpaddd          %ymm14,%ymm3,%ymm10
348         cmp             $0x40,%rcx
349         jl              .Lxorpart4
350         vpxord          0x30(%rdx),%xmm10,%xmm9
351         vmovdqu         %xmm9,0x30(%rsi)
352         vextracti128    $1,%ymm10,%xmm3
353
354         # xor and write second block
355         vmovdqa         %xmm0,%xmm10
356         cmp             $0x50,%rcx
357         jl              .Lxorpart4
358         vpxord          0x40(%rdx),%xmm10,%xmm9
359         vmovdqu         %xmm9,0x40(%rsi)
360
361         vmovdqa         %xmm1,%xmm10
362         cmp             $0x60,%rcx
363         jl              .Lxorpart4
364         vpxord          0x50(%rdx),%xmm10,%xmm9
365         vmovdqu         %xmm9,0x50(%rsi)
366
367         vmovdqa         %xmm2,%xmm10
368         cmp             $0x70,%rcx
369         jl              .Lxorpart4
370         vpxord          0x60(%rdx),%xmm10,%xmm9
371         vmovdqu         %xmm9,0x60(%rsi)
372
373         vmovdqa         %xmm3,%xmm10
374         cmp             $0x80,%rcx
375         jl              .Lxorpart4
376         vpxord          0x70(%rdx),%xmm10,%xmm9
377         vmovdqu         %xmm9,0x70(%rsi)
378
379         # o0 = i0 ^ (x0 + s0), third block
380         vpaddd          %ymm11,%ymm4,%ymm10
381         cmp             $0x90,%rcx
382         jl              .Lxorpart4
383         vpxord          0x80(%rdx),%xmm10,%xmm9
384         vmovdqu         %xmm9,0x80(%rsi)
385         vextracti128    $1,%ymm10,%xmm4
386         # o1 = i1 ^ (x1 + s1), third block
387         vpaddd          %ymm12,%ymm5,%ymm10
388         cmp             $0xa0,%rcx
389         jl              .Lxorpart4
390         vpxord          0x90(%rdx),%xmm10,%xmm9
391         vmovdqu         %xmm9,0x90(%rsi)
392         vextracti128    $1,%ymm10,%xmm5
393         # o2 = i2 ^ (x2 + s2), third block
394         vpaddd          %ymm13,%ymm6,%ymm10
395         cmp             $0xb0,%rcx
396         jl              .Lxorpart4
397         vpxord          0xa0(%rdx),%xmm10,%xmm9
398         vmovdqu         %xmm9,0xa0(%rsi)
399         vextracti128    $1,%ymm10,%xmm6
400         # o3 = i3 ^ (x3 + s3), third block
401         vpaddd          %ymm15,%ymm7,%ymm10
402         cmp             $0xc0,%rcx
403         jl              .Lxorpart4
404         vpxord          0xb0(%rdx),%xmm10,%xmm9
405         vmovdqu         %xmm9,0xb0(%rsi)
406         vextracti128    $1,%ymm10,%xmm7
407
408         # xor and write fourth block
409         vmovdqa         %xmm4,%xmm10
410         cmp             $0xd0,%rcx
411         jl              .Lxorpart4
412         vpxord          0xc0(%rdx),%xmm10,%xmm9
413         vmovdqu         %xmm9,0xc0(%rsi)
414
415         vmovdqa         %xmm5,%xmm10
416         cmp             $0xe0,%rcx
417         jl              .Lxorpart4
418         vpxord          0xd0(%rdx),%xmm10,%xmm9
419         vmovdqu         %xmm9,0xd0(%rsi)
420
421         vmovdqa         %xmm6,%xmm10
422         cmp             $0xf0,%rcx
423         jl              .Lxorpart4
424         vpxord          0xe0(%rdx),%xmm10,%xmm9
425         vmovdqu         %xmm9,0xe0(%rsi)
426
427         vmovdqa         %xmm7,%xmm10
428         cmp             $0x100,%rcx
429         jl              .Lxorpart4
430         vpxord          0xf0(%rdx),%xmm10,%xmm9
431         vmovdqu         %xmm9,0xf0(%rsi)
432
433 .Ldone4:
434         vzeroupper
435         ret
436
437 .Lxorpart4:
438         # xor remaining bytes from partial register into output
439         mov             %rcx,%rax
440         and             $0xf,%rcx
441         jz              .Ldone8
442         mov             %rax,%r9
443         and             $~0xf,%r9
444
445         mov             $1,%rax
446         shld            %cl,%rax,%rax
447         sub             $1,%rax
448         kmovq           %rax,%k1
449
450         vmovdqu8        (%rdx,%r9),%xmm1{%k1}{z}
451         vpxord          %xmm10,%xmm1,%xmm1
452         vmovdqu8        %xmm1,(%rsi,%r9){%k1}
453
454         jmp             .Ldone4
455
456 ENDPROC(chacha_4block_xor_avx512vl)
457
458 ENTRY(chacha_8block_xor_avx512vl)
459         # %rdi: Input state matrix, s
460         # %rsi: up to 8 data blocks output, o
461         # %rdx: up to 8 data blocks input, i
462         # %rcx: input/output length in bytes
463         # %r8d: nrounds
464
465         # This function encrypts eight consecutive ChaCha blocks by loading
466         # the state matrix in AVX registers eight times. Compared to AVX2, this
467         # mostly benefits from the new rotate instructions in VL and the
468         # additional registers.
469
470         vzeroupper
471
472         # x0..15[0-7] = s[0..15]
473         vpbroadcastd    0x00(%rdi),%ymm0
474         vpbroadcastd    0x04(%rdi),%ymm1
475         vpbroadcastd    0x08(%rdi),%ymm2
476         vpbroadcastd    0x0c(%rdi),%ymm3
477         vpbroadcastd    0x10(%rdi),%ymm4
478         vpbroadcastd    0x14(%rdi),%ymm5
479         vpbroadcastd    0x18(%rdi),%ymm6
480         vpbroadcastd    0x1c(%rdi),%ymm7
481         vpbroadcastd    0x20(%rdi),%ymm8
482         vpbroadcastd    0x24(%rdi),%ymm9
483         vpbroadcastd    0x28(%rdi),%ymm10
484         vpbroadcastd    0x2c(%rdi),%ymm11
485         vpbroadcastd    0x30(%rdi),%ymm12
486         vpbroadcastd    0x34(%rdi),%ymm13
487         vpbroadcastd    0x38(%rdi),%ymm14
488         vpbroadcastd    0x3c(%rdi),%ymm15
489
490         # x12 += counter values 0-3
491         vpaddd          CTR8BL(%rip),%ymm12,%ymm12
492
493         vmovdqa64       %ymm0,%ymm16
494         vmovdqa64       %ymm1,%ymm17
495         vmovdqa64       %ymm2,%ymm18
496         vmovdqa64       %ymm3,%ymm19
497         vmovdqa64       %ymm4,%ymm20
498         vmovdqa64       %ymm5,%ymm21
499         vmovdqa64       %ymm6,%ymm22
500         vmovdqa64       %ymm7,%ymm23
501         vmovdqa64       %ymm8,%ymm24
502         vmovdqa64       %ymm9,%ymm25
503         vmovdqa64       %ymm10,%ymm26
504         vmovdqa64       %ymm11,%ymm27
505         vmovdqa64       %ymm12,%ymm28
506         vmovdqa64       %ymm13,%ymm29
507         vmovdqa64       %ymm14,%ymm30
508         vmovdqa64       %ymm15,%ymm31
509
510 .Ldoubleround8:
511         # x0 += x4, x12 = rotl32(x12 ^ x0, 16)
512         vpaddd          %ymm0,%ymm4,%ymm0
513         vpxord          %ymm0,%ymm12,%ymm12
514         vprold          $16,%ymm12,%ymm12
515         # x1 += x5, x13 = rotl32(x13 ^ x1, 16)
516         vpaddd          %ymm1,%ymm5,%ymm1
517         vpxord          %ymm1,%ymm13,%ymm13
518         vprold          $16,%ymm13,%ymm13
519         # x2 += x6, x14 = rotl32(x14 ^ x2, 16)
520         vpaddd          %ymm2,%ymm6,%ymm2
521         vpxord          %ymm2,%ymm14,%ymm14
522         vprold          $16,%ymm14,%ymm14
523         # x3 += x7, x15 = rotl32(x15 ^ x3, 16)
524         vpaddd          %ymm3,%ymm7,%ymm3
525         vpxord          %ymm3,%ymm15,%ymm15
526         vprold          $16,%ymm15,%ymm15
527
528         # x8 += x12, x4 = rotl32(x4 ^ x8, 12)
529         vpaddd          %ymm12,%ymm8,%ymm8
530         vpxord          %ymm8,%ymm4,%ymm4
531         vprold          $12,%ymm4,%ymm4
532         # x9 += x13, x5 = rotl32(x5 ^ x9, 12)
533         vpaddd          %ymm13,%ymm9,%ymm9
534         vpxord          %ymm9,%ymm5,%ymm5
535         vprold          $12,%ymm5,%ymm5
536         # x10 += x14, x6 = rotl32(x6 ^ x10, 12)
537         vpaddd          %ymm14,%ymm10,%ymm10
538         vpxord          %ymm10,%ymm6,%ymm6
539         vprold          $12,%ymm6,%ymm6
540         # x11 += x15, x7 = rotl32(x7 ^ x11, 12)
541         vpaddd          %ymm15,%ymm11,%ymm11
542         vpxord          %ymm11,%ymm7,%ymm7
543         vprold          $12,%ymm7,%ymm7
544
545         # x0 += x4, x12 = rotl32(x12 ^ x0, 8)
546         vpaddd          %ymm0,%ymm4,%ymm0
547         vpxord          %ymm0,%ymm12,%ymm12
548         vprold          $8,%ymm12,%ymm12
549         # x1 += x5, x13 = rotl32(x13 ^ x1, 8)
550         vpaddd          %ymm1,%ymm5,%ymm1
551         vpxord          %ymm1,%ymm13,%ymm13
552         vprold          $8,%ymm13,%ymm13
553         # x2 += x6, x14 = rotl32(x14 ^ x2, 8)
554         vpaddd          %ymm2,%ymm6,%ymm2
555         vpxord          %ymm2,%ymm14,%ymm14
556         vprold          $8,%ymm14,%ymm14
557         # x3 += x7, x15 = rotl32(x15 ^ x3, 8)
558         vpaddd          %ymm3,%ymm7,%ymm3
559         vpxord          %ymm3,%ymm15,%ymm15
560         vprold          $8,%ymm15,%ymm15
561
562         # x8 += x12, x4 = rotl32(x4 ^ x8, 7)
563         vpaddd          %ymm12,%ymm8,%ymm8
564         vpxord          %ymm8,%ymm4,%ymm4
565         vprold          $7,%ymm4,%ymm4
566         # x9 += x13, x5 = rotl32(x5 ^ x9, 7)
567         vpaddd          %ymm13,%ymm9,%ymm9
568         vpxord          %ymm9,%ymm5,%ymm5
569         vprold          $7,%ymm5,%ymm5
570         # x10 += x14, x6 = rotl32(x6 ^ x10, 7)
571         vpaddd          %ymm14,%ymm10,%ymm10
572         vpxord          %ymm10,%ymm6,%ymm6
573         vprold          $7,%ymm6,%ymm6
574         # x11 += x15, x7 = rotl32(x7 ^ x11, 7)
575         vpaddd          %ymm15,%ymm11,%ymm11
576         vpxord          %ymm11,%ymm7,%ymm7
577         vprold          $7,%ymm7,%ymm7
578
579         # x0 += x5, x15 = rotl32(x15 ^ x0, 16)
580         vpaddd          %ymm0,%ymm5,%ymm0
581         vpxord          %ymm0,%ymm15,%ymm15
582         vprold          $16,%ymm15,%ymm15
583         # x1 += x6, x12 = rotl32(x12 ^ x1, 16)
584         vpaddd          %ymm1,%ymm6,%ymm1
585         vpxord          %ymm1,%ymm12,%ymm12
586         vprold          $16,%ymm12,%ymm12
587         # x2 += x7, x13 = rotl32(x13 ^ x2, 16)
588         vpaddd          %ymm2,%ymm7,%ymm2
589         vpxord          %ymm2,%ymm13,%ymm13
590         vprold          $16,%ymm13,%ymm13
591         # x3 += x4, x14 = rotl32(x14 ^ x3, 16)
592         vpaddd          %ymm3,%ymm4,%ymm3
593         vpxord          %ymm3,%ymm14,%ymm14
594         vprold          $16,%ymm14,%ymm14
595
596         # x10 += x15, x5 = rotl32(x5 ^ x10, 12)
597         vpaddd          %ymm15,%ymm10,%ymm10
598         vpxord          %ymm10,%ymm5,%ymm5
599         vprold          $12,%ymm5,%ymm5
600         # x11 += x12, x6 = rotl32(x6 ^ x11, 12)
601         vpaddd          %ymm12,%ymm11,%ymm11
602         vpxord          %ymm11,%ymm6,%ymm6
603         vprold          $12,%ymm6,%ymm6
604         # x8 += x13, x7 = rotl32(x7 ^ x8, 12)
605         vpaddd          %ymm13,%ymm8,%ymm8
606         vpxord          %ymm8,%ymm7,%ymm7
607         vprold          $12,%ymm7,%ymm7
608         # x9 += x14, x4 = rotl32(x4 ^ x9, 12)
609         vpaddd          %ymm14,%ymm9,%ymm9
610         vpxord          %ymm9,%ymm4,%ymm4
611         vprold          $12,%ymm4,%ymm4
612
613         # x0 += x5, x15 = rotl32(x15 ^ x0, 8)
614         vpaddd          %ymm0,%ymm5,%ymm0
615         vpxord          %ymm0,%ymm15,%ymm15
616         vprold          $8,%ymm15,%ymm15
617         # x1 += x6, x12 = rotl32(x12 ^ x1, 8)
618         vpaddd          %ymm1,%ymm6,%ymm1
619         vpxord          %ymm1,%ymm12,%ymm12
620         vprold          $8,%ymm12,%ymm12
621         # x2 += x7, x13 = rotl32(x13 ^ x2, 8)
622         vpaddd          %ymm2,%ymm7,%ymm2
623         vpxord          %ymm2,%ymm13,%ymm13
624         vprold          $8,%ymm13,%ymm13
625         # x3 += x4, x14 = rotl32(x14 ^ x3, 8)
626         vpaddd          %ymm3,%ymm4,%ymm3
627         vpxord          %ymm3,%ymm14,%ymm14
628         vprold          $8,%ymm14,%ymm14
629
630         # x10 += x15, x5 = rotl32(x5 ^ x10, 7)
631         vpaddd          %ymm15,%ymm10,%ymm10
632         vpxord          %ymm10,%ymm5,%ymm5
633         vprold          $7,%ymm5,%ymm5
634         # x11 += x12, x6 = rotl32(x6 ^ x11, 7)
635         vpaddd          %ymm12,%ymm11,%ymm11
636         vpxord          %ymm11,%ymm6,%ymm6
637         vprold          $7,%ymm6,%ymm6
638         # x8 += x13, x7 = rotl32(x7 ^ x8, 7)
639         vpaddd          %ymm13,%ymm8,%ymm8
640         vpxord          %ymm8,%ymm7,%ymm7
641         vprold          $7,%ymm7,%ymm7
642         # x9 += x14, x4 = rotl32(x4 ^ x9, 7)
643         vpaddd          %ymm14,%ymm9,%ymm9
644         vpxord          %ymm9,%ymm4,%ymm4
645         vprold          $7,%ymm4,%ymm4
646
647         sub             $2,%r8d
648         jnz             .Ldoubleround8
649
650         # x0..15[0-3] += s[0..15]
651         vpaddd          %ymm16,%ymm0,%ymm0
652         vpaddd          %ymm17,%ymm1,%ymm1
653         vpaddd          %ymm18,%ymm2,%ymm2
654         vpaddd          %ymm19,%ymm3,%ymm3
655         vpaddd          %ymm20,%ymm4,%ymm4
656         vpaddd          %ymm21,%ymm5,%ymm5
657         vpaddd          %ymm22,%ymm6,%ymm6
658         vpaddd          %ymm23,%ymm7,%ymm7
659         vpaddd          %ymm24,%ymm8,%ymm8
660         vpaddd          %ymm25,%ymm9,%ymm9
661         vpaddd          %ymm26,%ymm10,%ymm10
662         vpaddd          %ymm27,%ymm11,%ymm11
663         vpaddd          %ymm28,%ymm12,%ymm12
664         vpaddd          %ymm29,%ymm13,%ymm13
665         vpaddd          %ymm30,%ymm14,%ymm14
666         vpaddd          %ymm31,%ymm15,%ymm15
667
668         # interleave 32-bit words in state n, n+1
669         vpunpckldq      %ymm1,%ymm0,%ymm16
670         vpunpckhdq      %ymm1,%ymm0,%ymm17
671         vpunpckldq      %ymm3,%ymm2,%ymm18
672         vpunpckhdq      %ymm3,%ymm2,%ymm19
673         vpunpckldq      %ymm5,%ymm4,%ymm20
674         vpunpckhdq      %ymm5,%ymm4,%ymm21
675         vpunpckldq      %ymm7,%ymm6,%ymm22
676         vpunpckhdq      %ymm7,%ymm6,%ymm23
677         vpunpckldq      %ymm9,%ymm8,%ymm24
678         vpunpckhdq      %ymm9,%ymm8,%ymm25
679         vpunpckldq      %ymm11,%ymm10,%ymm26
680         vpunpckhdq      %ymm11,%ymm10,%ymm27
681         vpunpckldq      %ymm13,%ymm12,%ymm28
682         vpunpckhdq      %ymm13,%ymm12,%ymm29
683         vpunpckldq      %ymm15,%ymm14,%ymm30
684         vpunpckhdq      %ymm15,%ymm14,%ymm31
685
686         # interleave 64-bit words in state n, n+2
687         vpunpcklqdq     %ymm18,%ymm16,%ymm0
688         vpunpcklqdq     %ymm19,%ymm17,%ymm1
689         vpunpckhqdq     %ymm18,%ymm16,%ymm2
690         vpunpckhqdq     %ymm19,%ymm17,%ymm3
691         vpunpcklqdq     %ymm22,%ymm20,%ymm4
692         vpunpcklqdq     %ymm23,%ymm21,%ymm5
693         vpunpckhqdq     %ymm22,%ymm20,%ymm6
694         vpunpckhqdq     %ymm23,%ymm21,%ymm7
695         vpunpcklqdq     %ymm26,%ymm24,%ymm8
696         vpunpcklqdq     %ymm27,%ymm25,%ymm9
697         vpunpckhqdq     %ymm26,%ymm24,%ymm10
698         vpunpckhqdq     %ymm27,%ymm25,%ymm11
699         vpunpcklqdq     %ymm30,%ymm28,%ymm12
700         vpunpcklqdq     %ymm31,%ymm29,%ymm13
701         vpunpckhqdq     %ymm30,%ymm28,%ymm14
702         vpunpckhqdq     %ymm31,%ymm29,%ymm15
703
704         # interleave 128-bit words in state n, n+4
705         # xor/write first four blocks
706         vmovdqa64       %ymm0,%ymm16
707         vperm2i128      $0x20,%ymm4,%ymm0,%ymm0
708         cmp             $0x0020,%rcx
709         jl              .Lxorpart8
710         vpxord          0x0000(%rdx),%ymm0,%ymm0
711         vmovdqu64       %ymm0,0x0000(%rsi)
712         vmovdqa64       %ymm16,%ymm0
713         vperm2i128      $0x31,%ymm4,%ymm0,%ymm4
714
715         vperm2i128      $0x20,%ymm12,%ymm8,%ymm0
716         cmp             $0x0040,%rcx
717         jl              .Lxorpart8
718         vpxord          0x0020(%rdx),%ymm0,%ymm0
719         vmovdqu64       %ymm0,0x0020(%rsi)
720         vperm2i128      $0x31,%ymm12,%ymm8,%ymm12
721
722         vperm2i128      $0x20,%ymm6,%ymm2,%ymm0
723         cmp             $0x0060,%rcx
724         jl              .Lxorpart8
725         vpxord          0x0040(%rdx),%ymm0,%ymm0
726         vmovdqu64       %ymm0,0x0040(%rsi)
727         vperm2i128      $0x31,%ymm6,%ymm2,%ymm6
728
729         vperm2i128      $0x20,%ymm14,%ymm10,%ymm0
730         cmp             $0x0080,%rcx
731         jl              .Lxorpart8
732         vpxord          0x0060(%rdx),%ymm0,%ymm0
733         vmovdqu64       %ymm0,0x0060(%rsi)
734         vperm2i128      $0x31,%ymm14,%ymm10,%ymm14
735
736         vperm2i128      $0x20,%ymm5,%ymm1,%ymm0
737         cmp             $0x00a0,%rcx
738         jl              .Lxorpart8
739         vpxord          0x0080(%rdx),%ymm0,%ymm0
740         vmovdqu64       %ymm0,0x0080(%rsi)
741         vperm2i128      $0x31,%ymm5,%ymm1,%ymm5
742
743         vperm2i128      $0x20,%ymm13,%ymm9,%ymm0
744         cmp             $0x00c0,%rcx
745         jl              .Lxorpart8
746         vpxord          0x00a0(%rdx),%ymm0,%ymm0
747         vmovdqu64       %ymm0,0x00a0(%rsi)
748         vperm2i128      $0x31,%ymm13,%ymm9,%ymm13
749
750         vperm2i128      $0x20,%ymm7,%ymm3,%ymm0
751         cmp             $0x00e0,%rcx
752         jl              .Lxorpart8
753         vpxord          0x00c0(%rdx),%ymm0,%ymm0
754         vmovdqu64       %ymm0,0x00c0(%rsi)
755         vperm2i128      $0x31,%ymm7,%ymm3,%ymm7
756
757         vperm2i128      $0x20,%ymm15,%ymm11,%ymm0
758         cmp             $0x0100,%rcx
759         jl              .Lxorpart8
760         vpxord          0x00e0(%rdx),%ymm0,%ymm0
761         vmovdqu64       %ymm0,0x00e0(%rsi)
762         vperm2i128      $0x31,%ymm15,%ymm11,%ymm15
763
764         # xor remaining blocks, write to output
765         vmovdqa64       %ymm4,%ymm0
766         cmp             $0x0120,%rcx
767         jl              .Lxorpart8
768         vpxord          0x0100(%rdx),%ymm0,%ymm0
769         vmovdqu64       %ymm0,0x0100(%rsi)
770
771         vmovdqa64       %ymm12,%ymm0
772         cmp             $0x0140,%rcx
773         jl              .Lxorpart8
774         vpxord          0x0120(%rdx),%ymm0,%ymm0
775         vmovdqu64       %ymm0,0x0120(%rsi)
776
777         vmovdqa64       %ymm6,%ymm0
778         cmp             $0x0160,%rcx
779         jl              .Lxorpart8
780         vpxord          0x0140(%rdx),%ymm0,%ymm0
781         vmovdqu64       %ymm0,0x0140(%rsi)
782
783         vmovdqa64       %ymm14,%ymm0
784         cmp             $0x0180,%rcx
785         jl              .Lxorpart8
786         vpxord          0x0160(%rdx),%ymm0,%ymm0
787         vmovdqu64       %ymm0,0x0160(%rsi)
788
789         vmovdqa64       %ymm5,%ymm0
790         cmp             $0x01a0,%rcx
791         jl              .Lxorpart8
792         vpxord          0x0180(%rdx),%ymm0,%ymm0
793         vmovdqu64       %ymm0,0x0180(%rsi)
794
795         vmovdqa64       %ymm13,%ymm0
796         cmp             $0x01c0,%rcx
797         jl              .Lxorpart8
798         vpxord          0x01a0(%rdx),%ymm0,%ymm0
799         vmovdqu64       %ymm0,0x01a0(%rsi)
800
801         vmovdqa64       %ymm7,%ymm0
802         cmp             $0x01e0,%rcx
803         jl              .Lxorpart8
804         vpxord          0x01c0(%rdx),%ymm0,%ymm0
805         vmovdqu64       %ymm0,0x01c0(%rsi)
806
807         vmovdqa64       %ymm15,%ymm0
808         cmp             $0x0200,%rcx
809         jl              .Lxorpart8
810         vpxord          0x01e0(%rdx),%ymm0,%ymm0
811         vmovdqu64       %ymm0,0x01e0(%rsi)
812
813 .Ldone8:
814         vzeroupper
815         ret
816
817 .Lxorpart8:
818         # xor remaining bytes from partial register into output
819         mov             %rcx,%rax
820         and             $0x1f,%rcx
821         jz              .Ldone8
822         mov             %rax,%r9
823         and             $~0x1f,%r9
824
825         mov             $1,%rax
826         shld            %cl,%rax,%rax
827         sub             $1,%rax
828         kmovq           %rax,%k1
829
830         vmovdqu8        (%rdx,%r9),%ymm1{%k1}{z}
831         vpxord          %ymm0,%ymm1,%ymm1
832         vmovdqu8        %ymm1,(%rsi,%r9){%k1}
833
834         jmp             .Ldone8
835
836 ENDPROC(chacha_8block_xor_avx512vl)