/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _ASM_X86_XOR_H
#define _ASM_X86_XOR_H

/*
 * Optimized RAID-5 checksumming functions for SSE.
 */

/*
 * Cache avoiding checksumming functions utilizing KNI instructions
 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
 */

/*
 * Based on
 * High-speed RAID5 checksumming functions utilizing SSE instructions.
 * Copyright (C) 1998 Ingo Molnar.
 */

/*
 * x86-64 changes / gcc fixes from Andi Kleen.
 * Copyright 2002 Andi Kleen, SuSE Labs.
 *
 * This hasn't been optimized for the hammer yet, but there are likely
 * no advantages to be gotten from x86-64 here anyways.
 */

#include <asm/fpu/api.h>

#ifdef CONFIG_X86_32
/* reduce register pressure */
# define XOR_CONSTANT_CONSTRAINT "i"
#else
# define XOR_CONSTANT_CONSTRAINT "re"
#endif
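/*
 * Note: "i" forces the 256-byte stride used below to be encoded as an
 * immediate, which avoids tying up a register on register-starved
 * 32-bit builds; "re" on x86-64 lets the compiler pick either a
 * register or a sign-extended 32-bit immediate, whichever it prefers.
 */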

#define OFFS(x)         "16*("#x")"
#define PF_OFFS(x)      "256+16*("#x")"
#define PF0(x)          "       prefetchnta "PF_OFFS(x)"(%[p1])         ;\n"
#define LD(x, y)        "       movaps "OFFS(x)"(%[p1]), %%xmm"#y"      ;\n"
#define ST(x, y)        "       movaps %%xmm"#y", "OFFS(x)"(%[p1])      ;\n"
#define PF1(x)          "       prefetchnta "PF_OFFS(x)"(%[p2])         ;\n"
#define PF2(x)          "       prefetchnta "PF_OFFS(x)"(%[p3])         ;\n"
#define PF3(x)          "       prefetchnta "PF_OFFS(x)"(%[p4])         ;\n"
#define PF4(x)          "       prefetchnta "PF_OFFS(x)"(%[p5])         ;\n"
#define XO1(x, y)       "       xorps "OFFS(x)"(%[p2]), %%xmm"#y"       ;\n"
#define XO2(x, y)       "       xorps "OFFS(x)"(%[p3]), %%xmm"#y"       ;\n"
#define XO3(x, y)       "       xorps "OFFS(x)"(%[p4]), %%xmm"#y"       ;\n"
#define XO4(x, y)       "       xorps "OFFS(x)"(%[p5]), %%xmm"#y"       ;\n"
#define NOP(x)

#define BLK64(pf, op, i)                                \
                pf(i)                                   \
                op(i, 0)                                \
                        op(i + 1, 1)                    \
                                op(i + 2, 2)            \
                                        op(i + 3, 3)
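
/*
 * For illustration, BLK64(PF0, LD, 0) expands (modulo whitespace) to
 * the following fragment, which prefetches 256 bytes ahead in p1 and
 * then loads one 64-byte chunk of it into %xmm0-%xmm3:
 *
 *      prefetchnta 256+16*(0)(%[p1])
 *      movaps 16*(0)(%[p1]), %xmm0
 *      movaps 16*(0 + 1)(%[p1]), %xmm1
 *      movaps 16*(0 + 2)(%[p1]), %xmm2
 *      movaps 16*(0 + 3)(%[p1]), %xmm3
 */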
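/*
 * Each loop iteration in the functions below processes one 256-byte
 * "line" (hence lines = bytes >> 8): four 64-byte BLOCKs spread across
 * %xmm0-%xmm3, with prefetchnta issued 256 bytes ahead so the streamed
 * data pollutes the caches as little as possible. kernel_fpu_begin()
 * and kernel_fpu_end() bracket the SSE usage so the current FPU
 * context is preserved.
 */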
static void
xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                                        \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                PF1(i)                                  \
                                PF1(i + 2)              \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
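
/*
 * For reference, a hypothetical scalar version of xor_sse_2() would
 * look like the sketch below: the asm above computes exactly this,
 * just 256 bytes per iteration, in SSE registers, with prefetching.
 *
 *      static void xor_scalar_2(unsigned long bytes,
 *                               unsigned long *p1, unsigned long *p2)
 *      {
 *              unsigned long i;
 *
 *              for (i = 0; i < bytes / sizeof(unsigned long); i++)
 *                      p1[i] ^= p2[i];
 *      }
 */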
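/*
 * The _pf64 variants walk each 64-byte chunk source by source: one
 * prefetchnta plus four 16-byte ops per source (see BLK64 above),
 * rather than interleaving loads, XORs and prefetches across all
 * sources as the plain variants do. Which scheme wins on a given CPU
 * is what the boot-time XOR benchmark decides.
 */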
static void
xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(NOP, ST, i)       \

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}
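
/*
 * The 3-, 4- and 5-source versions below follow the same two patterns,
 * adding one XO<n> stage (and the matching PF<n> prefetches) per extra
 * source buffer.
 */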

static void
xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
               unsigned long *p3)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(PF2, XO2, i)      \
                BLK64(NOP, ST, i)       \

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines),
          [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                XO3(i, 0)                               \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1),
          [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
               unsigned long *p3, unsigned long *p4)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(PF2, XO2, i)      \
                BLK64(PF3, XO3, i)      \
                BLK64(NOP, ST, i)       \

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1),
          [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i) \
                PF1(i)                                  \
                                PF1(i + 2)              \
                LD(i, 0)                                \
                        LD(i + 1, 1)                    \
                                LD(i + 2, 2)            \
                                        LD(i + 3, 3)    \
                PF2(i)                                  \
                                PF2(i + 2)              \
                XO1(i, 0)                               \
                        XO1(i + 1, 1)                   \
                                XO1(i + 2, 2)           \
                                        XO1(i + 3, 3)   \
                PF3(i)                                  \
                                PF3(i + 2)              \
                XO2(i, 0)                               \
                        XO2(i + 1, 1)                   \
                                XO2(i + 2, 2)           \
                                        XO2(i + 3, 3)   \
                PF4(i)                                  \
                                PF4(i + 2)              \
                PF0(i + 4)                              \
                                PF0(i + 6)              \
                XO3(i, 0)                               \
                        XO3(i + 1, 1)                   \
                                XO3(i + 2, 2)           \
                                        XO3(i + 3, 3)   \
                XO4(i, 0)                               \
                        XO4(i + 1, 1)                   \
                                XO4(i + 2, 2)           \
                                        XO4(i + 3, 3)   \
                ST(i, 0)                                \
                        ST(i + 1, 1)                    \
                                ST(i + 2, 2)            \
                                        ST(i + 3, 3)    \


                PF0(0)
                                PF0(2)

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       add %[inc], %[p5]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
          [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static void
xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
               unsigned long *p3, unsigned long *p4, unsigned long *p5)
{
        unsigned long lines = bytes >> 8;

        kernel_fpu_begin();

        asm volatile(
#undef BLOCK
#define BLOCK(i)                        \
                BLK64(PF0, LD, i)       \
                BLK64(PF1, XO1, i)      \
                BLK64(PF2, XO2, i)      \
                BLK64(PF3, XO3, i)      \
                BLK64(PF4, XO4, i)      \
                BLK64(NOP, ST, i)       \

        " .align 32                     ;\n"
        " 1:                            ;\n"

                BLOCK(0)
                BLOCK(4)
                BLOCK(8)
                BLOCK(12)

        "       add %[inc], %[p1]       ;\n"
        "       add %[inc], %[p2]       ;\n"
        "       add %[inc], %[p3]       ;\n"
        "       add %[inc], %[p4]       ;\n"
        "       add %[inc], %[p5]       ;\n"
        "       dec %[cnt]              ;\n"
        "       jnz 1b                  ;\n"
        : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
          [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
        : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
        : "memory");

        kernel_fpu_end();
}

static struct xor_block_template xor_block_sse_pf64 = {
        .name = "prefetch64-sse",
        .do_2 = xor_sse_2_pf64,
        .do_3 = xor_sse_3_pf64,
        .do_4 = xor_sse_4_pf64,
        .do_5 = xor_sse_5_pf64,
};
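
/*
 * A sketch of how such a template is consumed (see crypto/xor.c for
 * the authoritative code; names here come from that file and may vary
 * by kernel version): the XOR core benchmarks the registered templates
 * at boot, keeps the fastest one as active_template, and xor_blocks()
 * then dispatches through it, e.g.
 *
 *      active_template->do_2(bytes, dest, src);
 *
 * "prefetch64-sse" competes there against the other SSE/AVX templates.
 */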

#undef LD
#undef XO1
#undef XO2
#undef XO3
#undef XO4
#undef ST
#undef NOP
#undef BLK64
#undef BLOCK

#undef XOR_CONSTANT_CONSTRAINT

#ifdef CONFIG_X86_32
# include <asm/xor_32.h>
#else
# include <asm/xor_64.h>
#endif

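/*
 * AVX_SELECT() comes from the xor_avx.h header pulled in via
 * xor_32.h/xor_64.h; the assumption here (check that header for the
 * real definition) is that it returns the AVX template when the CPU
 * supports AVX and otherwise falls back to FASTEST, the winner of the
 * boot-time benchmark.
 */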
#define XOR_SELECT_TEMPLATE(FASTEST) \
        AVX_SELECT(FASTEST)

#endif /* _ASM_X86_XOR_H */