arch/powerpc/crypto/crc32-vpmsum_core.S

   1 /* SPDX-License-Identifier: GPL-2.0-or-later */
   2 /*
   3  * Core of the accelerated CRC algorithm.
   4  * In your file, define the constants, CRC_FUNCTION_NAME and, for a
   5  * bit-reflected CRC, REFLECT. Then include this file.
   6  *
   7  * Calculate the checksum of data that is 16 byte aligned and a multiple of
   8  * 16 bytes.
   9  *
  10  * The first step is to reduce it to 1024 bits. We do this in 8 parallel
  11  * chunks in order to mask the latency of the vpmsum instructions. If we
  12  * have more than 32 kB of data to checksum we repeat this step multiple
  13  * times, passing in the previous 1024 bits.
  14  *
  15  * The next step is to reduce the 1024 bits to 64 bits. This step adds
  16  * 32 bits of 0s to the end - this matches what a CRC does. We just
  17  * calculate constants that land the data in this 32 bits.
  18  *
  19  * We then use fixed point Barrett reduction to compute a mod n over GF(2)
  20  * for n = CRC using POWER8 instructions. We use x = 32.
  21  *
  22  * http://en.wikipedia.org/wiki/Barrett_reduction
  23  *
  24  * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
  25 */
  26
  27 #include <asm/ppc_asm.h>
  28 #include <asm/ppc-opcode.h>
  29
  30 #define MAX_SIZE        32768   /* bytes consumed per pass of the blocked main loop */
  31
  32         .text
  33 /* BYTESWAP_DATA: set for big-endian REFLECT and little-endian non-REFLECT builds */
  34 #if defined(__BIG_ENDIAN__) && defined(REFLECT)
  35 #define BYTESWAP_DATA
  36 #elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
  37 #define BYTESWAP_DATA
  38 #else
  39 #undef BYTESWAP_DATA
  40 #endif
  41 /* GPRs preloaded with 16..112, used as index registers for lvx/stvx */
  42 #define off16           r25
  43 #define off32           r26
  44 #define off48           r27
  45 #define off64           r28
  46 #define off80           r29
  47 #define off96           r30
  48 #define off112          r31
  49 /* Vector registers that receive entries of the constants tables */
  50 #define const1          v24
  51 #define const2          v25
  52 /* byteswap: vperm control; mask_*: ones in the bottom 32/64 bits; zeroes: all zero */
  53 #define byteswap        v26
  54 #define mask_32bit      v27
  55 #define mask_64bit      v28
  56 #define zeroes          v29
  57 /* VPERM() emits a vperm only when BYTESWAP_DATA is defined, otherwise nothing */
  58 #ifdef BYTESWAP_DATA
  59 #define VPERM(A, B, C, D) vperm A, B, C, D
  60 #else
  61 #define VPERM(A, B, C, D)
  62 #endif
  63
  64 /* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len): r3 = crc, r4 = p, r5 = len, result in r3 */
  65 FUNC_START(CRC_FUNCTION_NAME)
  66         std     r31,-8(r1)      /* save r25-r31 below r1; they are reused as off16..off112 */
  67         std     r30,-16(r1)
  68         std     r29,-24(r1)
  69         std     r28,-32(r1)
  70         std     r27,-40(r1)
  71         std     r26,-48(r1)
  72         std     r25,-56(r1)
  73
  74         li      off16,16
  75         li      off32,32
  76         li      off48,48
  77         li      off64,64
  78         li      off80,80
  79         li      off96,96
  80         li      off112,112
  81         li      r0,0            /* r0 = 0: v16-v23 hold no carried-over data yet */
  82
  83         /* Enough room for saving 10 non volatile VMX registers (v20-v29) */
  84         subi    r6,r1,56+10*16  /* r6 = base for v20-v27 */
  85         subi    r7,r1,56+2*16   /* r7 = base for v28-v29 */
  86
  87         stvx    v20,0,r6
  88         stvx    v21,off16,r6
  89         stvx    v22,off32,r6
  90         stvx    v23,off48,r6
  91         stvx    v24,off64,r6
  92         stvx    v25,off80,r6
  93         stvx    v26,off96,r6
  94         stvx    v27,off112,r6
  95         stvx    v28,0,r7
  96         stvx    v29,off16,r7
  97
  98         mr      r10,r3          /* keep the incoming crc; .Lzero returns it for len == 0 */
  99
 100         vxor    zeroes,zeroes,zeroes
 101         vspltisw v0,-1          /* v0 = all ones */
 102
 103         vsldoi  mask_32bit,zeroes,v0,4  /* ones in the bottom 32 bits only */
 104         vsldoi  mask_64bit,zeroes,v0,8  /* ones in the bottom 64 bits only */
 105
 106         /* Get the initial value into v8 */
 107         vxor    v8,v8,v8
 108         MTVRD(v8, R3)
 109 #ifdef REFLECT
 110         vsldoi  v8,zeroes,v8,8  /* shift into bottom 32 bits */
 111 #else
 112         vsldoi  v8,v8,zeroes,4  /* shift into top 32 bits */
 113 #endif
 114
 115 #ifdef BYTESWAP_DATA
 116         addis   r3,r2,.byteswap_constant@toc@ha
 117         addi    r3,r3,.byteswap_constant@toc@l
 118
 119         lvx     byteswap,0,r3   /* permute control consumed by VPERM() */
 120         addi    r3,r3,16
 121 #endif
 122
 123         cmpdi   r5,256          /* fewer than 256 bytes: take the unrolled .Lshort path */
 124         blt     .Lshort
 125
 126         rldicr  r6,r5,0,56      /* r6 = len with the low 7 bits cleared (whole 128 byte chunks) */
 127
 128         /* Checksum in blocks of MAX_SIZE */
 129 1:      lis     r7,MAX_SIZE@h
 130         ori     r7,r7,MAX_SIZE@l        /* r7 = MAX_SIZE */
 131         mr      r9,r7                   /* r9 = MAX_SIZE, for the constants offset below */
 132         cmpd    r6,r7
 133         bgt     2f
 134         mr      r7,r6                   /* net effect: r7 = min(r6, MAX_SIZE), bytes in this block */
 135 2:      subf    r6,r7,r6                /* r6 -= r7 */
 136
 137         /* our main loop does 128 bytes at a time */
 138         srdi    r7,r7,7                 /* r7 = number of 128 byte chunks in this block */
 139
 140         /*
 141          * Work out the offset into the constants table to start at. Each
 142          * constant is 16 bytes, and it is used against 128 bytes of input
 143          * data - 128 / 16 = 8
 144          */
 145         sldi    r8,r7,4                 /* r8 = 16 * chunks */
 146         srdi    r9,r9,3                 /* r9 = MAX_SIZE / 8 */
 147         subf    r8,r8,r9                /* r8 = r9 - r8 */
 148
 149         /* We reduce our final 128 bytes in a separate step */
 150         addi    r7,r7,-1
 151         mtctr   r7                      /* ctr = chunks - 1 */
 152
 153         addis   r3,r2,.constants@toc@ha
 154         addi    r3,r3,.constants@toc@l
 155
 156         /* Find the start of our constants */
 157         add     r3,r3,r8
 158
 159         /* zero v0-v7 which will contain our checksums */
 160         vxor    v0,v0,v0
 161         vxor    v1,v1,v1
 162         vxor    v2,v2,v2
 163         vxor    v3,v3,v3
 164         vxor    v4,v4,v4
 165         vxor    v5,v5,v5
 166         vxor    v6,v6,v6
 167         vxor    v7,v7,v7
 168
 169         lvx     const1,0,r3
 170
 171         /*
 172          * If we are looping back to consume more data we use the values
 173          * already in v16-v23.
 174          */
 175         cmpdi   r0,1                    /* r0 is set to 1 once a block has been folded */
 176         beq     2f
 177
 178         /* First warm up pass: load the first 128 bytes into v16-v23 */
 179         lvx     v16,0,r4
 180         lvx     v17,off16,r4
 181         VPERM(v16,v16,v16,byteswap)
 182         VPERM(v17,v17,v17,byteswap)
 183         lvx     v18,off32,r4
 184         lvx     v19,off48,r4
 185         VPERM(v18,v18,v18,byteswap)
 186         VPERM(v19,v19,v19,byteswap)
 187         lvx     v20,off64,r4
 188         lvx     v21,off80,r4
 189         VPERM(v20,v20,v20,byteswap)
 190         VPERM(v21,v21,v21,byteswap)
 191         lvx     v22,off96,r4
 192         lvx     v23,off112,r4
 193         VPERM(v22,v22,v22,byteswap)
 194         VPERM(v23,v23,v23,byteswap)
 195         addi    r4,r4,8*16
 196
 197         /* xor in initial value */
 198         vxor    v16,v16,v8
 199
 200 2:      bdz     .Lfirst_warm_up_done    /* decrement ctr; at zero there is nothing left to pipeline */
 201
 202         addi    r3,r3,16
 203         lvx     const2,0,r3             /* const2 = next entry of the constants table */
 204
 205         /* Second warm up pass: start multiplying v16-v23 by const1 while loading the next 128 bytes */
 206         VPMSUMD(v8,v16,const1)
 207         lvx     v16,0,r4
 208         VPERM(v16,v16,v16,byteswap)
 209         ori     r2,r2,0                 /* no-op (r2 unchanged); presumably scheduling padding -- confirm */
 210
 211         VPMSUMD(v9,v17,const1)
 212         lvx     v17,off16,r4
 213         VPERM(v17,v17,v17,byteswap)
 214         ori     r2,r2,0
 215
 216         VPMSUMD(v10,v18,const1)
 217         lvx     v18,off32,r4
 218         VPERM(v18,v18,v18,byteswap)
 219         ori     r2,r2,0
 220
 221         VPMSUMD(v11,v19,const1)
 222         lvx     v19,off48,r4
 223         VPERM(v19,v19,v19,byteswap)
 224         ori     r2,r2,0
 225
 226         VPMSUMD(v12,v20,const1)
 227         lvx     v20,off64,r4
 228         VPERM(v20,v20,v20,byteswap)
 229         ori     r2,r2,0
 230
 231         VPMSUMD(v13,v21,const1)
 232         lvx     v21,off80,r4
 233         VPERM(v21,v21,v21,byteswap)
 234         ori     r2,r2,0
 235
 236         VPMSUMD(v14,v22,const1)
 237         lvx     v22,off96,r4
 238         VPERM(v22,v22,v22,byteswap)
 239         ori     r2,r2,0
 240
 241         VPMSUMD(v15,v23,const1)
 242         lvx     v23,off112,r4
 243         VPERM(v23,v23,v23,byteswap)
 244
 245         addi    r4,r4,8*16
 246
 247         bdz     .Lfirst_cool_down       /* ctr exhausted: skip the steady-state loop */
 248
 249         /*
 250          * main loop. We modulo schedule it such that it takes three iterations
 251          * to complete - first iteration load, second iteration vpmsum, third
 252          * iteration xor.
 253          */
 254         .balign 16
 255 4:      lvx     const1,0,r3             /* same table entry const2 was loaded with earlier */
 256         addi    r3,r3,16
 257         ori     r2,r2,0                 /* no-op (r2 unchanged); presumably scheduling padding -- confirm */
 258
 259         vxor    v0,v0,v8                /* accumulate the product from the previous iteration */
 260         VPMSUMD(v8,v16,const2)
 261         lvx     v16,0,r4
 262         VPERM(v16,v16,v16,byteswap)
 263         ori     r2,r2,0
 264
 265         vxor    v1,v1,v9
 266         VPMSUMD(v9,v17,const2)
 267         lvx     v17,off16,r4
 268         VPERM(v17,v17,v17,byteswap)
 269         ori     r2,r2,0
 270
 271         vxor    v2,v2,v10
 272         VPMSUMD(v10,v18,const2)
 273         lvx     v18,off32,r4
 274         VPERM(v18,v18,v18,byteswap)
 275         ori     r2,r2,0
 276
 277         vxor    v3,v3,v11
 278         VPMSUMD(v11,v19,const2)
 279         lvx     v19,off48,r4
 280         VPERM(v19,v19,v19,byteswap)
 281         lvx     const2,0,r3             /* refresh const2 for the next iteration; const1 covers the rest of this one */
 282         ori     r2,r2,0
 283
 284         vxor    v4,v4,v12
 285         VPMSUMD(v12,v20,const1)
 286         lvx     v20,off64,r4
 287         VPERM(v20,v20,v20,byteswap)
 288         ori     r2,r2,0
 289
 290         vxor    v5,v5,v13
 291         VPMSUMD(v13,v21,const1)
 292         lvx     v21,off80,r4
 293         VPERM(v21,v21,v21,byteswap)
 294         ori     r2,r2,0
 295
 296         vxor    v6,v6,v14
 297         VPMSUMD(v14,v22,const1)
 298         lvx     v22,off96,r4
 299         VPERM(v22,v22,v22,byteswap)
 300         ori     r2,r2,0
 301
 302         vxor    v7,v7,v15
 303         VPMSUMD(v15,v23,const1)
 304         lvx     v23,off112,r4
 305         VPERM(v23,v23,v23,byteswap)
 306
 307         addi    r4,r4,8*16              /* advance the data pointer by one 128 byte chunk */
 308
 309         bdnz    4b
 310
 311 .Lfirst_cool_down:
 312         /* First cool down pass: accumulate pending products and multiply the last loaded chunk, no more loads */
 313         lvx     const1,0,r3
 314         addi    r3,r3,16
 315
 316         vxor    v0,v0,v8
 317         VPMSUMD(v8,v16,const1)
 318         ori     r2,r2,0                 /* no-op (r2 unchanged); presumably scheduling padding -- confirm */
 319
 320         vxor    v1,v1,v9
 321         VPMSUMD(v9,v17,const1)
 322         ori     r2,r2,0
 323
 324         vxor    v2,v2,v10
 325         VPMSUMD(v10,v18,const1)
 326         ori     r2,r2,0
 327
 328         vxor    v3,v3,v11
 329         VPMSUMD(v11,v19,const1)
 330         ori     r2,r2,0
 331
 332         vxor    v4,v4,v12
 333         VPMSUMD(v12,v20,const1)
 334         ori     r2,r2,0
 335
 336         vxor    v5,v5,v13
 337         VPMSUMD(v13,v21,const1)
 338         ori     r2,r2,0
 339
 340         vxor    v6,v6,v14
 341         VPMSUMD(v14,v22,const1)
 342         ori     r2,r2,0
 343
 344         vxor    v7,v7,v15
 345         VPMSUMD(v15,v23,const1)
 346         ori     r2,r2,0
 347
 348 .Lsecond_cool_down:
 349         /* Second cool down pass: accumulate the last outstanding products into v0-v7 */
 350         vxor    v0,v0,v8
 351         vxor    v1,v1,v9
 352         vxor    v2,v2,v10
 353         vxor    v3,v3,v11
 354         vxor    v4,v4,v12
 355         vxor    v5,v5,v13
 356         vxor    v6,v6,v14
 357         vxor    v7,v7,v15
 358
 359 #ifdef REFLECT
 360         /*
 361          * vpmsumd produces a 96 bit result in the least significant bits
 362          * of the register. Since we are bit reflected we have to shift it
 363          * left 32 bits so it occupies the least significant bits in the
 364          * bit reflected domain.
 365          */
 366         vsldoi  v0,v0,zeroes,4
 367         vsldoi  v1,v1,zeroes,4
 368         vsldoi  v2,v2,zeroes,4
 369         vsldoi  v3,v3,zeroes,4
 370         vsldoi  v4,v4,zeroes,4
 371         vsldoi  v5,v5,zeroes,4
 372         vsldoi  v6,v6,zeroes,4
 373         vsldoi  v7,v7,zeroes,4
 374 #endif
 375
 376         /* xor with last 1024 bits (the final 128 bytes of this block, loaded into v8-v15) */
 377         lvx     v8,0,r4
 378         lvx     v9,off16,r4
 379         VPERM(v8,v8,v8,byteswap)
 380         VPERM(v9,v9,v9,byteswap)
 381         lvx     v10,off32,r4
 382         lvx     v11,off48,r4
 383         VPERM(v10,v10,v10,byteswap)
 384         VPERM(v11,v11,v11,byteswap)
 385         lvx     v12,off64,r4
 386         lvx     v13,off80,r4
 387         VPERM(v12,v12,v12,byteswap)
 388         VPERM(v13,v13,v13,byteswap)
 389         lvx     v14,off96,r4
 390         lvx     v15,off112,r4
 391         VPERM(v14,v14,v14,byteswap)
 392         VPERM(v15,v15,v15,byteswap)
 393
 394         addi    r4,r4,8*16
 395
 396         vxor    v16,v0,v8               /* v16-v23 = folded 1024 bit state */
 397         vxor    v17,v1,v9
 398         vxor    v18,v2,v10
 399         vxor    v19,v3,v11
 400         vxor    v20,v4,v12
 401         vxor    v21,v5,v13
 402         vxor    v22,v6,v14
 403         vxor    v23,v7,v15
 404
 405         li      r0,1                    /* tell the next block to reuse v16-v23 instead of loading */
 406         cmpdi   r6,0                    /* any bytes left for another block? */
 407         addi    r6,r6,128               /* does not alter CR0; presumably accounts for the 128 bytes carried in v16-v23 */
 408         bne     1b
 409
 410         /* Work out how many bytes we have left */
 411         andi.   r5,r5,127
 412
 413         /* Calculate where in the constant table we need to start */
 414         subfic  r6,r5,128               /* r6 = 128 - tail bytes */
 415         add     r3,r3,r6
 416
 417         /* How many 16 byte chunks are in the tail */
 418         srdi    r7,r5,4
 419         mtctr   r7
 420
 421         /*
 422          * Reduce the previously calculated 1024 bits to 64 bits, shifting
 423          * 32 bits to include the trailing 32 bits of zeros
 424          */
 425         lvx     v0,0,r3
 426         lvx     v1,off16,r3
 427         lvx     v2,off32,r3
 428         lvx     v3,off48,r3
 429         lvx     v4,off64,r3
 430         lvx     v5,off80,r3
 431         lvx     v6,off96,r3
 432         lvx     v7,off112,r3
 433         addi    r3,r3,8*16              /* r3 -> constants used for the tail chunks */
 434
 435         VPMSUMW(v0,v16,v0)
 436         VPMSUMW(v1,v17,v1)
 437         VPMSUMW(v2,v18,v2)
 438         VPMSUMW(v3,v19,v3)
 439         VPMSUMW(v4,v20,v4)
 440         VPMSUMW(v5,v21,v5)
 441         VPMSUMW(v6,v22,v6)
 442         VPMSUMW(v7,v23,v7)
 443
 444         /* Now reduce the tail (0 - 112 bytes), one 16 byte chunk per step */
 445         cmpdi   r7,0
 446         beq     1f                      /* no tail chunks */
 447
 448         lvx     v16,0,r4
 449         lvx     v17,0,r3
 450         VPERM(v16,v16,v16,byteswap)
 451         VPMSUMW(v16,v16,v17)
 452         vxor    v0,v0,v16               /* every tail product is accumulated into v0 */
 453         bdz     1f
 454
 455         lvx     v16,off16,r4
 456         lvx     v17,off16,r3
 457         VPERM(v16,v16,v16,byteswap)
 458         VPMSUMW(v16,v16,v17)
 459         vxor    v0,v0,v16
 460         bdz     1f
 461
 462         lvx     v16,off32,r4
 463         lvx     v17,off32,r3
 464         VPERM(v16,v16,v16,byteswap)
 465         VPMSUMW(v16,v16,v17)
 466         vxor    v0,v0,v16
 467         bdz     1f
 468
 469         lvx     v16,off48,r4
 470         lvx     v17,off48,r3
 471         VPERM(v16,v16,v16,byteswap)
 472         VPMSUMW(v16,v16,v17)
 473         vxor    v0,v0,v16
 474         bdz     1f
 475
 476         lvx     v16,off64,r4
 477         lvx     v17,off64,r3
 478         VPERM(v16,v16,v16,byteswap)
 479         VPMSUMW(v16,v16,v17)
 480         vxor    v0,v0,v16
 481         bdz     1f
 482
 483         lvx     v16,off80,r4
 484         lvx     v17,off80,r3
 485         VPERM(v16,v16,v16,byteswap)
 486         VPMSUMW(v16,v16,v17)
 487         vxor    v0,v0,v16
 488         bdz     1f
 489
 490         lvx     v16,off96,r4
 491         lvx     v17,off96,r3
 492         VPERM(v16,v16,v16,byteswap)
 493         VPMSUMW(v16,v16,v17)
 494         vxor    v0,v0,v16
 495
 496         /* Now xor all the parallel chunks together */
 497 1:      vxor    v0,v0,v1
 498         vxor    v2,v2,v3
 499         vxor    v4,v4,v5
 500         vxor    v6,v6,v7
 501
 502         vxor    v0,v0,v2
 503         vxor    v4,v4,v6
 504
 505         vxor    v0,v0,v4                /* v0 = combined result handed to the Barrett reduction */
 506
 507 .Lbarrett_reduction:
 508         /* Barrett constants: const1 and const2 are loaded from .barrett_constants */
 509         addis   r3,r2,.barrett_constants@toc@ha
 510         addi    r3,r3,.barrett_constants@toc@l
 511
 512         lvx     const1,0,r3
 513         lvx     const2,off16,r3
 514
 515         vsldoi  v1,v0,v0,8              /* v1 = v0 with its 64 bit halves swapped */
 516         vxor    v0,v0,v1                /* xor two 64 bit results together */
 517
 518 #ifdef REFLECT
 519         /* shift left one bit */
 520         vspltisb v1,1
 521         vsl     v0,v0,v1
 522 #endif
 523
 524         vand    v0,v0,mask_64bit        /* keep only the bottom 64 bits */
 525 #ifndef REFLECT
 526         /*
 527          * Now for the Barrett reduction algorithm. The idea is to calculate q,
 528          * the multiple of our polynomial that we need to subtract. By
 529          * doing the computation 2x bits higher (ie 64 bits) and shifting the
 530          * result back down 2x bits, we round down to the nearest multiple.
 531          */
 532         VPMSUMD(v1,v0,const1)   /* ma */
 533         vsldoi  v1,zeroes,v1,8  /* q = floor(ma/(2^64)) */
 534         VPMSUMD(v1,v1,const2)   /* qn */
 535         vxor    v0,v0,v1        /* a - qn, subtraction is xor in GF(2) */
 536
 537         /*
 538          * Get the result into r3. We need to shift it left 8 bytes:
 539          * V0 [ 0 1 2 X ]
 540          * V0 [ 0 X 2 3 ]
 541          */
 542         vsldoi  v0,v0,zeroes,8  /* shift result into top 64 bits */
 543 #else
 544         /*
 545          * The reflected version of Barrett reduction. Instead of bit
 546          * reflecting our data (which is expensive to do), we bit reflect our
 547          * constants and our algorithm, which means the intermediate data in
 548          * our vector registers goes from 0-63 instead of 63-0. We can reflect
 549          * the algorithm because we don't carry in mod 2 arithmetic.
 550          */
 551         vand    v1,v0,mask_32bit        /* bottom 32 bits of a */
 552         VPMSUMD(v1,v1,const1)           /* ma */
 553         vand    v1,v1,mask_32bit        /* bottom 32bits of ma */
 554         VPMSUMD(v1,v1,const2)           /* qn */
 555         vxor    v0,v0,v1                /* a - qn, subtraction is xor in GF(2) */
 556
 557         /*
 558          * Since we are bit reflected, the result (ie the low 32 bits) is in
 559          * the high 32 bits. We just need to shift it left 4 bytes
 560          * V0 [ 0 1 X 3 ]
 561          * V0 [ 0 X 2 3 ]
 562          */
 563         vsldoi  v0,v0,zeroes,4          /* shift result into top 64 bits of v0 */
 564 #endif
 565
 566         /* Get it into r3 (the return value) */
 567         MFVRD(R3, v0)
 568
 569 .Lout:
 570         subi    r6,r1,56+10*16          /* same save-area addresses as computed on entry */
 571         subi    r7,r1,56+2*16
 572
 573         lvx     v20,0,r6                /* restore v20-v29 */
 574         lvx     v21,off16,r6
 575         lvx     v22,off32,r6
 576         lvx     v23,off48,r6
 577         lvx     v24,off64,r6
 578         lvx     v25,off80,r6
 579         lvx     v26,off96,r6
 580         lvx     v27,off112,r6
 581         lvx     v28,0,r7
 582         lvx     v29,off16,r7
 583
 584         ld      r31,-8(r1)              /* restore r25-r31; must follow the lvx above, which index with them */
 585         ld      r30,-16(r1)
 586         ld      r29,-24(r1)
 587         ld      r28,-32(r1)
 588         ld      r27,-40(r1)
 589         ld      r26,-48(r1)
 590         ld      r25,-56(r1)
 591
 592         blr                             /* return with the result in r3 */
 593
 594 .Lfirst_warm_up_done:              /* reached when ctr ran out right after the first warm up pass */
 595         lvx     const1,0,r3
 596         addi    r3,r3,16
 597
 598         VPMSUMD(v8,v16,const1)          /* multiply v16-v23 once; products land in v8-v15 */
 599         VPMSUMD(v9,v17,const1)
 600         VPMSUMD(v10,v18,const1)
 601         VPMSUMD(v11,v19,const1)
 602         VPMSUMD(v12,v20,const1)
 603         VPMSUMD(v13,v21,const1)
 604         VPMSUMD(v14,v22,const1)
 605         VPMSUMD(v15,v23,const1)
 606
 607         b       .Lsecond_cool_down      /* accumulate the products and xor in the final 128 bytes */
 608
 609 .Lshort:                           /* len < 256: one VPMSUMW per 16 byte chunk, fully unrolled */
 610         cmpdi   r5,0
 611         beq     .Lzero
 612
 613         addis   r3,r2,.short_constants@toc@ha
 614         addi    r3,r3,.short_constants@toc@l
 615
 616         /* Calculate where in the constant table we need to start */
 617         subfic  r6,r5,256               /* r6 = 256 - len */
 618         add     r3,r3,r6
 619
 620         /* How many 16 byte chunks? */
 621         srdi    r7,r5,4
 622         mtctr   r7
 623
 624         vxor    v19,v19,v19             /* v19 and v20: two accumulators for the chunk products */
 625         vxor    v20,v20,v20
 626
 627         lvx     v0,0,r4
 628         lvx     v16,0,r3
 629         VPERM(v0,v0,v16,byteswap)
 630         vxor    v0,v0,v8        /* xor in initial value */
 631         VPMSUMW(v0,v0,v16)
 632         bdz     .Lv0
 633
 634         lvx     v1,off16,r4
 635         lvx     v17,off16,r3
 636         VPERM(v1,v1,v17,byteswap)
 637         VPMSUMW(v1,v1,v17)
 638         bdz     .Lv1
 639
 640         lvx     v2,off32,r4
 641         lvx     v16,off32,r3
 642         VPERM(v2,v2,v16,byteswap)
 643         VPMSUMW(v2,v2,v16)
 644         bdz     .Lv2
 645
 646         lvx     v3,off48,r4
 647         lvx     v17,off48,r3
 648         VPERM(v3,v3,v17,byteswap)
 649         VPMSUMW(v3,v3,v17)
 650         bdz     .Lv3
 651
 652         lvx     v4,off64,r4
 653         lvx     v16,off64,r3
 654         VPERM(v4,v4,v16,byteswap)
 655         VPMSUMW(v4,v4,v16)
 656         bdz     .Lv4
 657
 658         lvx     v5,off80,r4
 659         lvx     v17,off80,r3
 660         VPERM(v5,v5,v17,byteswap)
 661         VPMSUMW(v5,v5,v17)
 662         bdz     .Lv5
 663
 664         lvx     v6,off96,r4
 665         lvx     v16,off96,r3
 666         VPERM(v6,v6,v16,byteswap)
 667         VPMSUMW(v6,v6,v16)
 668         bdz     .Lv6
 669
 670         lvx     v7,off112,r4
 671         lvx     v17,off112,r3
 672         VPERM(v7,v7,v17,byteswap)
 673         VPMSUMW(v7,v7,v17)
 674         bdz     .Lv7
 675
 676         addi    r3,r3,128               /* the offset registers only reach 112: advance both pointers */
 677         addi    r4,r4,128
 678
 679         lvx     v8,0,r4
 680         lvx     v16,0,r3
 681         VPERM(v8,v8,v16,byteswap)
 682         VPMSUMW(v8,v8,v16)
 683         bdz     .Lv8
 684
 685         lvx     v9,off16,r4
 686         lvx     v17,off16,r3
 687         VPERM(v9,v9,v17,byteswap)
 688         VPMSUMW(v9,v9,v17)
 689         bdz     .Lv9
 690
 691         lvx     v10,off32,r4
 692         lvx     v16,off32,r3
 693         VPERM(v10,v10,v16,byteswap)
 694         VPMSUMW(v10,v10,v16)
 695         bdz     .Lv10
 696
 697         lvx     v11,off48,r4
 698         lvx     v17,off48,r3
 699         VPERM(v11,v11,v17,byteswap)
 700         VPMSUMW(v11,v11,v17)
 701         bdz     .Lv11
 702
 703         lvx     v12,off64,r4
 704         lvx     v16,off64,r3
 705         VPERM(v12,v12,v16,byteswap)
 706         VPMSUMW(v12,v12,v16)
 707         bdz     .Lv12
 708
 709         lvx     v13,off80,r4
 710         lvx     v17,off80,r3
 711         VPERM(v13,v13,v17,byteswap)
 712         VPMSUMW(v13,v13,v17)
 713         bdz     .Lv13
 714
 715         lvx     v14,off96,r4
 716         lvx     v16,off96,r3
 717         VPERM(v14,v14,v16,byteswap)
 718         VPMSUMW(v14,v14,v16)
 719         bdz     .Lv14
 720
 721         lvx     v15,off112,r4
 722         lvx     v17,off112,r3
 723         VPERM(v15,v15,v17,byteswap)
 724         VPMSUMW(v15,v15,v17)
 725
 726 .Lv15:  vxor    v19,v19,v15     /* entry at .LvN falls through and accumulates products vN down to v0 */
 727 .Lv14:  vxor    v20,v20,v14
 728 .Lv13:  vxor    v19,v19,v13
 729 .Lv12:  vxor    v20,v20,v12
 730 .Lv11:  vxor    v19,v19,v11
 731 .Lv10:  vxor    v20,v20,v10
 732 .Lv9:   vxor    v19,v19,v9
 733 .Lv8:   vxor    v20,v20,v8
 734 .Lv7:   vxor    v19,v19,v7
 735 .Lv6:   vxor    v20,v20,v6
 736 .Lv5:   vxor    v19,v19,v5
 737 .Lv4:   vxor    v20,v20,v4
 738 .Lv3:   vxor    v19,v19,v3
 739 .Lv2:   vxor    v20,v20,v2
 740 .Lv1:   vxor    v19,v19,v1
 741 .Lv0:   vxor    v20,v20,v0
 742
 743         vxor    v0,v19,v20              /* merge the two accumulators into v0 */
 744
 745         b       .Lbarrett_reduction     /* final reduction expects its input in v0 */
 746
 747 .Lzero:                            /* len == 0 */
 748         mr      r3,r10                  /* return the crc that was passed in (saved in r10 on entry) */
 749         b       .Lout                   /* restore the saved registers and return */
 750
 751 FUNC_END(CRC_FUNCTION_NAME)