// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses. You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Function API:
//
// UINT16 crc_t10dif_pcl(
//     UINT16 init_crc,          // initial CRC value, 16 bits
//     const unsigned char *buf, // buffer pointer to calculate CRC on
//     UINT64 len                // buffer length in bytes (64-bit data)
// );
//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
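//
// Reading aid (not part of the original sources): the paper's central
// identity is that CRCs are remainders in GF(2)[x], so a 128-bit chunk
// M that still has T bits of message after it satisfies
//
//	(M(x) * x^T) mod Q = (M(x) * (x^T mod Q)) mod Q
//
// x^T mod Q fits in a word and is precomputed per fold distance T (the
// rk* constants below), so one carryless multiply plus one xor "folds"
// each chunk onto later data; the full reduction mod Q happens only
// once, at the very end.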
#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.cpu		generic+crypto

	arg1_low32	.req	w19
	arg2		.req	x20
	arg3		.req	x21

	vzr		.req	v13

	ad		.req	v14
	bd		.req	v10

	k00_16		.req	v15
	k32_48		.req	v16

	t3		.req	v17
	t4		.req	v18
	t5		.req	v19
	t6		.req	v20
	t7		.req	v21
	t8		.req	v22
	t9		.req	v23

	perm1		.req	v24
	perm2		.req	v25
	perm3		.req	v26
	perm4		.req	v27

	bd1		.req	v28
	bd2		.req	v29
	bd3		.req	v30
	bd4		.req	v31
	.macro		__pmull_init_p64
	.endm

	.macro		__pmull_pre_p64, bd
	.endm
	.macro		__pmull_init_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		perm4.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, perm4.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		perm4.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		perm4.2d, perm1.2d, #40
	.endm
	.macro		__pmull_pre_p8, bd
	tbl		bd1.16b, {\bd\().16b}, perm1.16b
	tbl		bd2.16b, {\bd\().16b}, perm2.16b
	tbl		bd3.16b, {\bd\().16b}, perm3.16b
	tbl		bd4.16b, {\bd\().16b}, perm4.16b
	.endm
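//
// Note (explanatory): perm1..perm4 rotate each 64-bit lane left by 1..4
// bytes, so bd1..bd4 hold byte-rotated copies of the fixed operand B.
// The p8 core below reassembles a 64x64-bit carryless product out of
// 8x8-bit pmull results using these precomputed rotations, for CPUs
// without the 64x64 PMULL crypto extension.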
__pmull_p8_core:
.L__pmull_p8_core:
	ext		t4.8b, ad.8b, ad.8b, #1			// A1
	ext		t5.8b, ad.8b, ad.8b, #2			// A2
	ext		t6.8b, ad.8b, ad.8b, #3			// A3

	pmull		t4.8h, t4.8b, bd.8b			// F = A1*B
	pmull		t8.8h, ad.8b, bd1.8b			// E = A*B1
	pmull		t5.8h, t5.8b, bd.8b			// H = A2*B
	pmull		t7.8h, ad.8b, bd2.8b			// G = A*B2
	pmull		t6.8h, t6.8b, bd.8b			// J = A3*B
	pmull		t9.8h, ad.8b, bd3.8b			// I = A*B3
	pmull		t3.8h, ad.8b, bd4.8b			// K = A*B4
	b		0f

.L__pmull_p8_core2:
	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
	tbl		t6.16b, {ad.16b}, perm3.16b		// A3

	pmull2		t4.8h, t4.16b, bd.16b			// F = A1*B
	pmull2		t8.8h, ad.16b, bd1.16b			// E = A*B1
	pmull2		t5.8h, t5.16b, bd.16b			// H = A2*B
	pmull2		t7.8h, ad.16b, bd2.16b			// G = A*B2
	pmull2		t6.8h, t6.16b, bd.16b			// J = A3*B
	pmull2		t9.8h, ad.16b, bd3.16b			// I = A*B3
	pmull2		t3.8h, ad.16b, bd4.16b			// K = A*B4
0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
	eor		t5.16b, t5.16b, t7.16b			// M = G + H
	eor		t6.16b, t6.16b, t9.16b			// N = I + J

	uzp1		t8.2d, t4.2d, t5.2d
	uzp2		t4.2d, t4.2d, t5.2d
	uzp1		t7.2d, t6.2d, t3.2d
	uzp2		t6.2d, t6.2d, t3.2d

	// t4 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t8.16b, t8.16b, t4.16b
	and		t4.16b, t4.16b, k32_48.16b

	// t6 = (N) (P4 + P5) << 24
	// t7 = (K) (P6 + P7) << 32
	eor		t7.16b, t7.16b, t6.16b
	and		t6.16b, t6.16b, k00_16.16b

	eor		t8.16b, t8.16b, t4.16b
	eor		t7.16b, t7.16b, t6.16b

	zip2		t5.2d, t8.2d, t4.2d
	zip1		t4.2d, t8.2d, t4.2d
	zip2		t3.2d, t7.2d, t6.2d
	zip1		t6.2d, t7.2d, t6.2d

	ext		t4.16b, t4.16b, t4.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t6.16b, t6.16b, t6.16b, #13
	ext		t3.16b, t3.16b, t3.16b, #12

	eor		t4.16b, t4.16b, t5.16b
	eor		t6.16b, t6.16b, t3.16b
	ret
ENDPROC(__pmull_p8_core)
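//
// Reading aid (not part of the original sources): writing the partial
// products as D = A*B (computed by the caller __pmull_p8), the core
// leaves
//
//	A*B = D xor t4 xor t6,  with t4 = L<<8 xor M<<16
//	                        and  t6 = N<<24 xor K<<32
//
// The uzp/zip pairs regroup the 8-byte halves of the partial products,
// the ext instructions with #15/#14/#13/#12 realize the byte shifts
// <<8/<<16/<<24/<<32 as rotations, and the k00_16/k32_48 masks clear
// the bits that would otherwise wrap around.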
	.macro		__pmull_p8, rq, ad, bd, i
	.ifnc		\bd, v10
	.err
	.endif
	mov		ad.16b, \ad\().16b
	.ifb		\i
	pmull		\rq\().8h, \ad\().8b, bd.8b		// D = A*B
	.else
	pmull2		\rq\().8h, \ad\().16b, bd.16b		// D = A*B
	.endif

	bl		.L__pmull_p8_core\i

	eor		\rq\().16b, \rq\().16b, t4.16b
	eor		\rq\().16b, \rq\().16b, t6.16b
	.endm
	.macro		fold64, p, reg1, reg2
	ldp		q11, q12, [arg2], #0x20

	__pmull_\p	v8, \reg1, v10, 2
	__pmull_\p	\reg1, \reg1, v10

CPU_LE(	rev64		v11.16b, v11.16b	)
CPU_LE(	rev64		v12.16b, v12.16b	)

	__pmull_\p	v9, \reg2, v10, 2
	__pmull_\p	\reg2, \reg2, v10

CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)

	eor		\reg1\().16b, \reg1\().16b, v8.16b
	eor		\reg2\().16b, \reg2\().16b, v9.16b
	eor		\reg1\().16b, \reg1\().16b, v11.16b
	eor		\reg2\().16b, \reg2\().16b, v12.16b
	.endm
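//
// Reading aid (not part of the original sources): with v10 loaded from
// rk3 (so v10.d[0] = rk3, v10.d[1] = rk4), fold64 computes per state
// register
//
//	reg = reg.d[0]*rk3  xor  reg.d[1]*rk4  xor  data
//
// i.e. the plain pmull call multiplies the low half and the ", 2"
// (pmull2) call the high half by the precomputed x^T mod Q constants,
// and 16 fresh bytes of input are merged in per register. Everything
// stays congruent modulo Q without a full reduction.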
	.macro		fold16, p, reg, rk
	__pmull_\p	v8, \reg, v10
	__pmull_\p	\reg, \reg, v10, 2
	.ifnb		\rk
	ldr_l		q10, \rk, x8
	__pmull_pre_\p	v10
	.endif
	eor		v7.16b, v7.16b, v8.16b
	eor		v7.16b, v7.16b, \reg\().16b
	.endm
	.macro		__pmull_p64, rd, rn, rm, n
	.ifb		\n
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.else
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endif
	.endm
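//
// Note (explanatory): pmull multiplies the low 64-bit lanes of its
// operands and pmull2 the high 64-bit lanes, each producing a full
// 128-bit carryless product, so the optional \n argument selects which
// half of the state register is multiplied.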
	.macro		crc_t10dif_pmull, p
	frame_push	3, 128

	mov		arg1_low32, w0
	mov		arg2, x1
	mov		arg3, x2

	movi		vzr.16b, #0		// init zero register

	__pmull_init_\p

	// adjust the 16-bit initial_crc value, scale it to 32 bits
	lsl		arg1_low32, arg1_low32, #16

	// check if the length is smaller than 256 bytes
	cmp		arg3, #256

	// for sizes below 256 bytes, we can't fold 128 bytes at a time
	b.lt		.L_less_than_128_\@
	// load the initial crc value.
	// the crc value does not need to be byte-reflected, but it needs
	// to be moved to the high part of the register, because the data
	// will be byte-reflected and will then align with the initial crc
	// in the correct place.
	movi		v10.16b, #0
	mov		v10.s[3], arg1_low32		// initial crc

	// receive the initial 128 bytes of data, xor the initial crc value
	ldp		q0, q1, [arg2]
	ldp		q2, q3, [arg2, #0x20]
	ldp		q4, q5, [arg2, #0x40]
	ldp		q6, q7, [arg2, #0x60]
	add		arg2, arg2, #0x80
CPU_LE(	rev64		v0.16b, v0.16b	)
CPU_LE(	rev64		v1.16b, v1.16b	)
CPU_LE(	rev64		v2.16b, v2.16b	)
CPU_LE(	rev64		v3.16b, v3.16b	)
CPU_LE(	rev64		v4.16b, v4.16b	)
CPU_LE(	rev64		v5.16b, v5.16b	)
CPU_LE(	rev64		v6.16b, v6.16b	)
CPU_LE(	rev64		v7.16b, v7.16b	)

CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
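//
// Note (explanatory): on little-endian, rev64 byte-swaps each 64-bit
// half and the following ext #8 swaps the two halves, so the pair
// performs a full 16-byte byte reversal, producing the byte order the
// folding constants were derived for.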
	// XOR the initial_crc value
	eor		v0.16b, v0.16b, v10.16b

	ldr_l		q10, rk3, x8	// v10 has rk3 and rk4
					// the type of pmull instruction
					// will determine which constant to use
	__pmull_pre_\p	v10

	// update the counter. subtract 256 instead of 128 to save one
	// instruction from the loop
	sub		arg3, arg3, #256

	// at this section of the code, there are 128*x+y (0 <= y < 128)
	// bytes of buffer. The .L_fold_64_B_loop will fold 128 bytes at a
	// time until we have 128+y bytes of buffer left

	// fold 128 bytes at a time. This section of the code folds 8
	// vector registers in parallel
.L_fold_64_B_loop_\@:
	fold64		\p, v0, v1
	fold64		\p, v2, v3
	fold64		\p, v4, v5
	fold64		\p, v6, v7

	subs		arg3, arg3, #128

	// check if there are another 128 bytes in the buffer to be able to fold
	b.lt		.L_fold_64_B_end_\@
	if_will_cond_yield_neon
	stp		q0, q1, [sp, #.Lframe_local_offset]
	stp		q2, q3, [sp, #.Lframe_local_offset + 32]
	stp		q4, q5, [sp, #.Lframe_local_offset + 64]
	stp		q6, q7, [sp, #.Lframe_local_offset + 96]
	do_cond_yield_neon
	ldp		q0, q1, [sp, #.Lframe_local_offset]
	ldp		q2, q3, [sp, #.Lframe_local_offset + 32]
	ldp		q4, q5, [sp, #.Lframe_local_offset + 64]
	ldp		q6, q7, [sp, #.Lframe_local_offset + 96]
	ldr_l		q10, rk3, x8
	movi		vzr.16b, #0		// init zero register
	__pmull_init_\p
	__pmull_pre_\p	v10
	endif_yield_neon

	b		.L_fold_64_B_loop_\@
.L_fold_64_B_end_\@:
	// at this point, the buffer pointer is pointing at the last y bytes
	// of the buffer, and the 128 bytes of folded data are in 8 of the
	// vector registers: v0 .. v7

	// fold the 8 vector registers to 1 vector register with different
	// constants
	ldr_l		q10, rk9, x8
	__pmull_pre_\p	v10

	fold16		\p, v0, rk11
	fold16		\p, v1, rk13
	fold16		\p, v2, rk15
	fold16		\p, v3, rk17
	fold16		\p, v4, rk19
	fold16		\p, v5, rk1
	fold16		\p, v6

	// instead of 128, we add 128-16 to the loop counter to save one
	// instruction from the loop; instead of a cmp instruction, we use
	// the negative flag of the adds together with the b.lt instruction
	adds		arg3, arg3, #(128-16)
	b.lt		.L_final_reduction_for_128_\@

	// now we have 16+y bytes left to reduce. 16 bytes are in register v7
	// and the rest is in memory. We can fold 16 bytes at a time if y >= 16;
	// continue folding 16 bytes at a time
.L_16B_reduction_loop_\@:
	__pmull_\p	v8, v7, v10
	__pmull_\p	v7, v7, v10, 2
	eor		v7.16b, v7.16b, v8.16b

	ldr		q0, [arg2], #16
CPU_LE(	rev64		v0.16b, v0.16b	)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b
	subs		arg3, arg3, #16

	// instead of a cmp instruction, we utilize the flags from the
	// subs above (the equivalent of: cmp arg3, 16-16).
	// check if there are any more 16-byte chunks in the buffer to fold
	b.ge		.L_16B_reduction_loop_\@
	// now we have 16+z bytes left to reduce, where 0 <= z < 16.
	// first, we reduce the data in the v7 register

.L_final_reduction_for_128_\@:
	// check if there is any more data to fold. If not, compute the
	// CRC of the final 128 bits
	adds		arg3, arg3, #16
	b.eq		.L_128_done_\@

	// here we are getting data that is less than 16 bytes.
	// since we know that there was data before the pointer, we can
	// offset the input pointer before the actual point, to receive
	// exactly 16 bytes. after that the registers need to be adjusted.
.L_get_last_two_regs_\@:
	add		arg2, arg2, arg3
	ldr		q1, [arg2, #-16]
CPU_LE(	rev64		v1.16b, v1.16b	)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)

	// get rid of the extra data that was loaded before:
	// load the shift constant
	adr_l		x4, tbl_shf_table + 16
	sub		x4, x4, arg3
	ld1		{v0.16b}, [x4]

	// shift v2 to the left by arg3 bytes
	tbl		v2.16b, {v7.16b}, v0.16b

	// shift v7 to the right by 16-arg3 bytes
	movi		v9.16b, #0x80
	eor		v0.16b, v0.16b, v9.16b
	tbl		v7.16b, {v7.16b}, v0.16b

	// blend
	sshr		v0.16b, v0.16b, #7	// convert to 8-bit mask
	bsl		v0.16b, v2.16b, v1.16b

	// fold 16 bytes
	__pmull_\p	v8, v7, v10
	__pmull_\p	v7, v7, v10, 2
	eor		v7.16b, v7.16b, v8.16b
	eor		v7.16b, v7.16b, v0.16b
.L_128_done_\@:
	// compute crc of a 128-bit value
	ldr_l		q10, rk5, x8		// rk5 and rk6 in v10
	__pmull_pre_\p	v10

	// 64b fold
	ext		v0.16b, vzr.16b, v7.16b, #8
	mov		v7.d[0], v7.d[1]
	__pmull_\p	v7, v7, v10
	eor		v7.16b, v7.16b, v0.16b

	// 32b fold
	ext		v0.16b, v7.16b, vzr.16b, #4
	mov		v7.s[3], vzr.s[0]
	__pmull_\p	v0, v0, v10, 2
	eor		v7.16b, v7.16b, v0.16b

	// barrett reduction
	ldr_l		q10, rk7, x8
	__pmull_pre_\p	v10
	mov		v0.d[0], v7.d[1]

	__pmull_\p	v0, v0, v10
	ext		v0.16b, vzr.16b, v0.16b, #12
	__pmull_\p	v0, v0, v10, 2
	ext		v0.16b, vzr.16b, v0.16b, #12
	eor		v7.16b, v7.16b, v0.16b
	mov		w0, v7.s[1]
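//
// Reading aid (not part of the original sources): this is the classic
// Barrett reduction from the Intel paper, computing the final remainder
// without a division. With v10 holding rk7 = floor(2^64/Q) and rk8 = Q,
// the two pmulls compute, roughly,
//
//	T1 = floor(R / x^32) * floor(2^64 / Q)
//	T2 = floor(T1 / x^32) * Q
//
// after which R xor T2 is congruent to R mod Q; the ext instructions
// with vzr perform the x^32 shifts and truncations as byte moves, and
// the (still 32-bit scaled) CRC lands in v7.s[1].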
.L_cleanup_\@:
	// scale the result back to 16 bits
	lsr		x0, x0, #16
	frame_pop
	ret

.L_less_than_128_\@:
	cbz		arg3, .L_cleanup_\@
	movi		v0.16b, #0
	mov		v0.s[3], arg1_low32	// get the initial crc value

	ldr		q7, [arg2], #0x10
CPU_LE(	rev64		v7.16b, v7.16b	)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b	// xor the initial crc value

	cmp		arg3, #16
	b.eq		.L_128_done_\@		// exactly 16 left
	b.lt		.L_less_than_16_left_\@
	ldr_l		q10, rk1, x8		// rk1 and rk2 in v10
	__pmull_pre_\p	v10

	// update the counter. subtract 32 instead of 16 to save one
	// instruction from the loop
	subs		arg3, arg3, #32
	b.ge		.L_16B_reduction_loop_\@

	// less than 32 left
	add		arg3, arg3, #16
	b		.L_get_last_two_regs_\@
.L_less_than_16_left_\@:
	// shift v7 to the right by 16-arg3 bytes, dropping the bytes that
	// were loaded past the end of the data
	adr_l		x0, tbl_shf_table + 16
	sub		x0, x0, arg3
	ld1		{v0.16b}, [x0]
	movi		v9.16b, #0x80
	eor		v0.16b, v0.16b, v9.16b
	tbl		v7.16b, {v7.16b}, v0.16b
	b		.L_128_done_\@
	.endm
ENTRY(crc_t10dif_pmull_p8)
	crc_t10dif_pmull	p8
ENDPROC(crc_t10dif_pmull_p8)

	.align		5
ENTRY(crc_t10dif_pmull_p64)
	crc_t10dif_pmull	p64
ENDPROC(crc_t10dif_pmull_p64)
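//
// A minimal sketch of the C caller side (illustrative, following the
// kernel's crct10dif glue code; kernel_neon_begin/end are required
// because these routines use the NEON register file):
//
//	u16 crc = init_crc;
//	kernel_neon_begin();
//	crc = crc_t10dif_pmull_p64(crc, data, length);
//	kernel_neon_end();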
// precomputed constants
// these constants are precomputed from the poly:
//	0x8bb70000 (0x8bb7 scaled to 32 bits)
	.section	".rodata", "a"
	.align		4
// Q = 0x18BB70000
// rk1 = 2^(32*3) mod Q << 32
// rk2 = 2^(32*5) mod Q << 32
// rk3 = 2^(32*15) mod Q << 32
// rk4 = 2^(32*17) mod Q << 32
// rk5 = 2^(32*3) mod Q << 32
// rk6 = 2^(32*2) mod Q << 32
// rk7 = floor(2^64/Q)
// rk8 = Q
rk1:	.octa		0x06df0000000000002d56000000000000
rk3:	.octa		0x7cf50000000000009d9d000000000000
rk5:	.octa		0x13680000000000002d56000000000000
rk7:	.octa		0x000000018bb7000000000001f65a57f8
rk9:	.octa		0xbfd6000000000000ceae000000000000
rk11:	.octa		0x713c0000000000001e16000000000000
rk13:	.octa		0x80a6000000000000f7f9000000000000
rk15:	.octa		0xe658000000000000044c000000000000
rk17:	.octa		0xa497000000000000ad18000000000000
rk19:	.octa		0xe7b50000000000006ee3000000000000
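//
// Note (explanatory): each .octa packs two 64-bit constants; the
// odd-numbered one (e.g. rk1) lands in the low quadword, i.e. lane d[0]
// consumed by pmull, and the even-numbered one (e.g. rk2) in lane d[1]
// consumed by pmull2, so a single ldr q loads a ready-to-use pair.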
// use these values for shift constants for the tbl/tbx instruction.
// different alignments result in values as shown:
//	DDQ 0x008f8e8d8c8b8a898887868584838281 // shl 15 (16-1) / shr1
//	DDQ 0x01008f8e8d8c8b8a8988878685848382 // shl 14 (16-2) / shr2
//	DDQ 0x0201008f8e8d8c8b8a89888786858483 // shl 13 (16-3) / shr3
//	DDQ 0x030201008f8e8d8c8b8a898887868584 // shl 12 (16-4) / shr4
//	DDQ 0x04030201008f8e8d8c8b8a8988878685 // shl 11 (16-5) / shr5
//	DDQ 0x0504030201008f8e8d8c8b8a89888786 // shl 10 (16-6) / shr6
//	DDQ 0x060504030201008f8e8d8c8b8a898887 // shl 9  (16-7) / shr7
//	DDQ 0x07060504030201008f8e8d8c8b8a8988 // shl 8  (16-8) / shr8
//	DDQ 0x0807060504030201008f8e8d8c8b8a89 // shl 7  (16-9) / shr9
//	DDQ 0x090807060504030201008f8e8d8c8b8a // shl 6  (16-10) / shr10
//	DDQ 0x0a090807060504030201008f8e8d8c8b // shl 5  (16-11) / shr11
//	DDQ 0x0b0a090807060504030201008f8e8d8c // shl 4  (16-12) / shr12
//	DDQ 0x0c0b0a090807060504030201008f8e8d // shl 3  (16-13) / shr13
//	DDQ 0x0d0c0b0a090807060504030201008f8e // shl 2  (16-14) / shr14
//	DDQ 0x0e0d0c0b0a090807060504030201008f // shl 1  (16-15) / shr15
tbl_shf_table:
	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0
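//
// Worked example (explanatory): for a tail of arg3 = 3 bytes, v0 is
// loaded from tbl_shf_table + 16 - 3, i.e. {0x8d, 0x8e, 0x8f, 0x0, 0x1,
// ..., 0xc}. Index bytes with the top bit set make tbl write 0 for that
// lane, so one tbl shifts the register left by 3 bytes; xor-ing the
// indices with 0x80 flips which lanes survive, giving the complementary
// 16-3 byte right shift, and sshr #7 turns the same top bits into the
// blend mask consumed by bsl.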