// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses. You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Function API:
//
// UINT16 crc_t10dif_pcl(
//     UINT16 init_crc,          // initial CRC value, 16 bits
//     const unsigned char *buf, // buffer pointer to calculate CRC on
//     UINT64 len                // buffer length in bytes (64-bit data)
// );
//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
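//
// Reading aid (not part of the original sources): the paper's central
// identity is that CRCs are remainders in GF(2)[x], so a 128-bit chunk
// M that still has T bits of message after it satisfies
//
//	(M(x) * x^T) mod Q = (M(x) * (x^T mod Q)) mod Q
//
// x^T mod Q fits in a word and is precomputed per fold distance T (the
// rk* constants below), so one carryless multiply plus one xor "folds"
// each chunk onto later data; the full reduction mod Q happens only
// once, at the very end.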
#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.cpu		generic+crypto

	arg1_low32	.req	w19
	arg2		.req	x20
	arg3		.req	x21

	vzr		.req	v13

	ad		.req	v14
	bd		.req	v10

	k00_16		.req	v15
	k32_48		.req	v16

	t3		.req	v17
	t4		.req	v18
	t5		.req	v19
	t6		.req	v20
	t7		.req	v21
	t8		.req	v22
	t9		.req	v23

	perm1		.req	v24
	perm2		.req	v25
	perm3		.req	v26
	perm4		.req	v27

	bd1		.req	v28
	bd2		.req	v29
	bd3		.req	v30
	bd4		.req	v31
	.macro		__pmull_init_p64
	.endm

	.macro		__pmull_pre_p64, bd
	.endm
	.macro		__pmull_init_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		perm4.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, perm4.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		perm4.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		perm4.2d, perm1.2d, #40
	.endm
	.macro		__pmull_pre_p8, bd
	tbl		bd1.16b, {\bd\().16b}, perm1.16b
	tbl		bd2.16b, {\bd\().16b}, perm2.16b
	tbl		bd3.16b, {\bd\().16b}, perm3.16b
	tbl		bd4.16b, {\bd\().16b}, perm4.16b
	.endm
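//
// Note (explanatory): perm1..perm4 rotate each 64-bit lane left by 1..4
// bytes, so bd1..bd4 hold byte-rotated copies of the fixed operand B.
// The p8 core below reassembles a 64x64-bit carryless product out of
// 8x8-bit pmull results using these precomputed rotations, for CPUs
// without the 64x64 PMULL crypto extension.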
__pmull_p8_core:
.L__pmull_p8_core:
	ext		t4.8b, ad.8b, ad.8b, #1			// A1
	ext		t5.8b, ad.8b, ad.8b, #2			// A2
	ext		t6.8b, ad.8b, ad.8b, #3			// A3

	pmull		t4.8h, t4.8b, bd.8b			// F = A1*B
	pmull		t8.8h, ad.8b, bd1.8b			// E = A*B1
	pmull		t5.8h, t5.8b, bd.8b			// H = A2*B
	pmull		t7.8h, ad.8b, bd2.8b			// G = A*B2
	pmull		t6.8h, t6.8b, bd.8b			// J = A3*B
	pmull		t9.8h, ad.8b, bd3.8b			// I = A*B3
	pmull		t3.8h, ad.8b, bd4.8b			// K = A*B4
	b		0f

.L__pmull_p8_core2:
	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
	tbl		t6.16b, {ad.16b}, perm3.16b		// A3

	pmull2		t4.8h, t4.16b, bd.16b			// F = A1*B
	pmull2		t8.8h, ad.16b, bd1.16b			// E = A*B1
	pmull2		t5.8h, t5.16b, bd.16b			// H = A2*B
	pmull2		t7.8h, ad.16b, bd2.16b			// G = A*B2
	pmull2		t6.8h, t6.16b, bd.16b			// J = A3*B
	pmull2		t9.8h, ad.16b, bd3.16b			// I = A*B3
	pmull2		t3.8h, ad.16b, bd4.16b			// K = A*B4
0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
	eor		t5.16b, t5.16b, t7.16b			// M = G + H
	eor		t6.16b, t6.16b, t9.16b			// N = I + J

	uzp1		t8.2d, t4.2d, t5.2d
	uzp2		t4.2d, t4.2d, t5.2d
	uzp1		t7.2d, t6.2d, t3.2d
	uzp2		t6.2d, t6.2d, t3.2d

	// t4 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t8.16b, t8.16b, t4.16b
	and		t4.16b, t4.16b, k32_48.16b

	// t6 = (N) (P4 + P5) << 24
	// t7 = (K) (P6 + P7) << 32
	eor		t7.16b, t7.16b, t6.16b
	and		t6.16b, t6.16b, k00_16.16b

	eor		t8.16b, t8.16b, t4.16b
	eor		t7.16b, t7.16b, t6.16b

	zip2		t5.2d, t8.2d, t4.2d
	zip1		t4.2d, t8.2d, t4.2d
	zip2		t3.2d, t7.2d, t6.2d
	zip1		t6.2d, t7.2d, t6.2d

	ext		t4.16b, t4.16b, t4.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t6.16b, t6.16b, t6.16b, #13
	ext		t3.16b, t3.16b, t3.16b, #12

	eor		t4.16b, t4.16b, t5.16b
	eor		t6.16b, t6.16b, t3.16b
	ret
ENDPROC(__pmull_p8_core)
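//
// Reading aid (not part of the original sources): writing the partial
// products as D = A*B (computed by the caller __pmull_p8), the core
// leaves
//
//	A*B = D xor t4 xor t6,  with t4 = L<<8 xor M<<16
//	                        and  t6 = N<<24 xor K<<32
//
// The uzp/zip pairs regroup the 8-byte halves of the partial products,
// the ext instructions with #15/#14/#13/#12 realize the byte shifts
// <<8/<<16/<<24/<<32 as rotations, and the k00_16/k32_48 masks clear
// the bits that would otherwise wrap around.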
	.macro		__pmull_p8, rq, ad, bd, i
	.ifnc		\bd, v10
	.err
	.endif
	mov		ad.16b, \ad\().16b
	.ifb		\i
	pmull		\rq\().8h, \ad\().8b, bd.8b		// D = A*B
	.else
	pmull2		\rq\().8h, \ad\().16b, bd.16b		// D = A*B
	.endif

	bl		.L__pmull_p8_core\i

	eor		\rq\().16b, \rq\().16b, t4.16b
	eor		\rq\().16b, \rq\().16b, t6.16b
	.endm
	.macro		fold64, p, reg1, reg2
	ldp		q11, q12, [arg2], #0x20

	__pmull_\p	v8, \reg1, v10, 2
	__pmull_\p	\reg1, \reg1, v10

CPU_LE(	rev64		v11.16b, v11.16b	)
CPU_LE(	rev64		v12.16b, v12.16b	)

	__pmull_\p	v9, \reg2, v10, 2
	__pmull_\p	\reg2, \reg2, v10

CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)

	eor		\reg1\().16b, \reg1\().16b, v8.16b
	eor		\reg2\().16b, \reg2\().16b, v9.16b
	eor		\reg1\().16b, \reg1\().16b, v11.16b
	eor		\reg2\().16b, \reg2\().16b, v12.16b
	.endm
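//
// Reading aid (not part of the original sources): with v10 loaded from
// rk3 (so v10.d[0] = rk3, v10.d[1] = rk4), fold64 computes per state
// register
//
//	reg = reg.d[0]*rk3  xor  reg.d[1]*rk4  xor  data
//
// i.e. the plain pmull call multiplies the low half and the ", 2"
// (pmull2) call the high half by the precomputed x^T mod Q constants,
// and 16 fresh bytes of input are merged in per register. Everything
// stays congruent modulo Q without a full reduction.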
	.macro		fold16, p, reg, rk
	__pmull_\p	v8, \reg, v10
	__pmull_\p	\reg, \reg, v10, 2
	.ifnb		\rk
	ldr_l		q10, \rk, x8
	__pmull_pre_\p	v10
	.endif
	eor		v7.16b, v7.16b, v8.16b
	eor		v7.16b, v7.16b, \reg\().16b
	.endm
	.macro		__pmull_p64, rd, rn, rm, n
	.ifb		\n
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.else
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endif
	.endm
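//
// Note (explanatory): pmull multiplies the low 64-bit lanes of its
// operands and pmull2 the high 64-bit lanes, each producing a full
// 128-bit carryless product, so the optional \n argument selects which
// half of the state register is multiplied.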
	.macro		crc_t10dif_pmull, p
	frame_push	3, 128

	mov		arg1_low32, w0
	mov		arg2, x1
	mov		arg3, x2

	movi		vzr.16b, #0		// init zero register

	__pmull_init_\p

	// adjust the 16-bit initial_crc value, scale it to 32 bits
	lsl		arg1_low32, arg1_low32, #16

	// check if the length is smaller than 256 bytes
	cmp		arg3, #256

	// for sizes below 256 bytes, we can't fold 128 bytes at a time
	b.lt		.L_less_than_128_\@
	// load the initial crc value.
	// the crc value does not need to be byte-reflected, but it needs
	// to be moved to the high part of the register, because the data
	// will be byte-reflected and will then align with the initial crc
	// in the correct place.
	movi		v10.16b, #0
	mov		v10.s[3], arg1_low32		// initial crc

	// receive the initial 128 bytes of data, xor the initial crc value
	ldp		q0, q1, [arg2]
	ldp		q2, q3, [arg2, #0x20]
	ldp		q4, q5, [arg2, #0x40]
	ldp		q6, q7, [arg2, #0x60]
	add		arg2, arg2, #0x80
CPU_LE(	rev64		v0.16b, v0.16b	)
CPU_LE(	rev64		v1.16b, v1.16b	)
CPU_LE(	rev64		v2.16b, v2.16b	)
CPU_LE(	rev64		v3.16b, v3.16b	)
CPU_LE(	rev64		v4.16b, v4.16b	)
CPU_LE(	rev64		v5.16b, v5.16b	)
CPU_LE(	rev64		v6.16b, v6.16b	)
CPU_LE(	rev64		v7.16b, v7.16b	)

CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
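//
// Note (explanatory): on little-endian, rev64 byte-swaps each 64-bit
// half and the following ext #8 swaps the two halves, so the pair
// performs a full 16-byte byte reversal, producing the byte order the
// folding constants were derived for.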
	// XOR the initial_crc value
	eor		v0.16b, v0.16b, v10.16b

	ldr_l		q10, rk3, x8	// v10 has rk3 and rk4
					// the type of pmull instruction
					// will determine which constant to use
	__pmull_pre_\p	v10

	// update the counter. subtract 256 instead of 128 to save one
	// instruction from the loop
	sub		arg3, arg3, #256

	// at this section of the code, there are 128*x+y (0 <= y < 128)
	// bytes of buffer. The .L_fold_64_B_loop will fold 128 bytes at a
	// time until we have 128+y bytes of buffer left

	// fold 128 bytes at a time. This section of the code folds 8
	// vector registers in parallel
.L_fold_64_B_loop_\@:
	fold64		\p, v0, v1
	fold64		\p, v2, v3
	fold64		\p, v4, v5
	fold64		\p, v6, v7

	subs		arg3, arg3, #128

	// check if there are another 128 bytes in the buffer to be able to fold
	b.lt		.L_fold_64_B_end_\@
	if_will_cond_yield_neon
	stp		q0, q1, [sp, #.Lframe_local_offset]
	stp		q2, q3, [sp, #.Lframe_local_offset + 32]
	stp		q4, q5, [sp, #.Lframe_local_offset + 64]
	stp		q6, q7, [sp, #.Lframe_local_offset + 96]
	do_cond_yield_neon
	ldp		q0, q1, [sp, #.Lframe_local_offset]
	ldp		q2, q3, [sp, #.Lframe_local_offset + 32]
	ldp		q4, q5, [sp, #.Lframe_local_offset + 64]
	ldp		q6, q7, [sp, #.Lframe_local_offset + 96]
	ldr_l		q10, rk3, x8
	movi		vzr.16b, #0		// init zero register
	__pmull_init_\p
	__pmull_pre_\p	v10
	endif_yield_neon

	b		.L_fold_64_B_loop_\@
.L_fold_64_B_end_\@:
	// at this point, the buffer pointer is pointing at the last y bytes
	// of the buffer, and the 128 bytes of folded data are in 8 of the
	// vector registers: v0 .. v7

	// fold the 8 vector registers to 1 vector register with different
	// constants
	ldr_l		q10, rk9, x8
	__pmull_pre_\p	v10

	fold16		\p, v0, rk11
	fold16		\p, v1, rk13
	fold16		\p, v2, rk15
	fold16		\p, v3, rk17
	fold16		\p, v4, rk19
	fold16		\p, v5, rk1
	fold16		\p, v6

	// instead of 128, we add 128-16 to the loop counter to save one
	// instruction from the loop; instead of a cmp instruction, we use
	// the negative flag of the adds together with the b.lt instruction
	adds		arg3, arg3, #(128-16)
	b.lt		.L_final_reduction_for_128_\@

	// now we have 16+y bytes left to reduce. 16 bytes are in register v7
	// and the rest is in memory. We can fold 16 bytes at a time if y >= 16;
	// continue folding 16 bytes at a time
.L_16B_reduction_loop_\@:
	__pmull_\p	v8, v7, v10
	__pmull_\p	v7, v7, v10, 2
	eor		v7.16b, v7.16b, v8.16b

	ldr		q0, [arg2], #16
CPU_LE(	rev64		v0.16b, v0.16b	)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b
	subs		arg3, arg3, #16

	// instead of a cmp instruction, we utilize the flags from the
	// subs above (the equivalent of: cmp arg3, 16-16).
	// check if there are any more 16-byte chunks in the buffer to fold
	b.ge		.L_16B_reduction_loop_\@
	// now we have 16+z bytes left to reduce, where 0 <= z < 16.
	// first, we reduce the data in the v7 register

.L_final_reduction_for_128_\@:
	// check if there is any more data to fold. If not, compute the
	// CRC of the final 128 bits
	adds		arg3, arg3, #16
	b.eq		.L_128_done_\@

	// here we are getting data that is less than 16 bytes.
	// since we know that there was data before the pointer, we can
	// offset the input pointer before the actual point, to receive
	// exactly 16 bytes. after that the registers need to be adjusted.
.L_get_last_two_regs_\@:
	add		arg2, arg2, arg3
	ldr		q1, [arg2, #-16]
CPU_LE(	rev64		v1.16b, v1.16b	)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)

	// get rid of the extra data that was loaded before:
	// load the shift constant
	adr_l		x4, tbl_shf_table + 16
	sub		x4, x4, arg3
	ld1		{v0.16b}, [x4]

	// shift v2 to the left by arg3 bytes
	tbl		v2.16b, {v7.16b}, v0.16b

	// shift v7 to the right by 16-arg3 bytes
	movi		v9.16b, #0x80
	eor		v0.16b, v0.16b, v9.16b
	tbl		v7.16b, {v7.16b}, v0.16b

	// blend
	sshr		v0.16b, v0.16b, #7	// convert to 8-bit mask
	bsl		v0.16b, v2.16b, v1.16b

	// fold 16 bytes
	__pmull_\p	v8, v7, v10
	__pmull_\p	v7, v7, v10, 2
	eor		v7.16b, v7.16b, v8.16b
	eor		v7.16b, v7.16b, v0.16b
.L_128_done_\@:
	// compute crc of a 128-bit value
	ldr_l		q10, rk5, x8		// rk5 and rk6 in v10
	__pmull_pre_\p	v10

	// 64b fold
	ext		v0.16b, vzr.16b, v7.16b, #8
	mov		v7.d[0], v7.d[1]
	__pmull_\p	v7, v7, v10
	eor		v7.16b, v7.16b, v0.16b

	// 32b fold
	ext		v0.16b, v7.16b, vzr.16b, #4
	mov		v7.s[3], vzr.s[0]
	__pmull_\p	v0, v0, v10, 2
	eor		v7.16b, v7.16b, v0.16b

	// barrett reduction
	ldr_l		q10, rk7, x8
	__pmull_pre_\p	v10
	mov		v0.d[0], v7.d[1]

	__pmull_\p	v0, v0, v10
	ext		v0.16b, vzr.16b, v0.16b, #12
	__pmull_\p	v0, v0, v10, 2
	ext		v0.16b, vzr.16b, v0.16b, #12
	eor		v7.16b, v7.16b, v0.16b
	mov		w0, v7.s[1]
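//
// Reading aid (not part of the original sources): this is the classic
// Barrett reduction from the Intel paper, computing the final remainder
// without a division. With v10 holding rk7 = floor(2^64/Q) and rk8 = Q,
// the two pmulls compute, roughly,
//
//	T1 = floor(R / x^32) * floor(2^64 / Q)
//	T2 = floor(T1 / x^32) * Q
//
// after which R xor T2 is congruent to R mod Q; the ext instructions
// with vzr perform the x^32 shifts and truncations as byte moves, and
// the (still 32-bit scaled) CRC lands in v7.s[1].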
.L_cleanup_\@:
	// scale the result back to 16 bits
	lsr		x0, x0, #16
	frame_pop
	ret

.L_less_than_128_\@:
	cbz		arg3, .L_cleanup_\@
	movi		v0.16b, #0
	mov		v0.s[3], arg1_low32	// get the initial crc value

	ldr		q7, [arg2], #0x10
CPU_LE(	rev64		v7.16b, v7.16b	)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b	// xor the initial crc value

	cmp		arg3, #16
	b.eq		.L_128_done_\@		// exactly 16 left
	b.lt		.L_less_than_16_left_\@
	ldr_l		q10, rk1, x8		// rk1 and rk2 in v10
	__pmull_pre_\p	v10

	// update the counter. subtract 32 instead of 16 to save one
	// instruction from the loop
	subs		arg3, arg3, #32
	b.ge		.L_16B_reduction_loop_\@

	// less than 32 left
	add		arg3, arg3, #16
	b		.L_get_last_two_regs_\@
.L_less_than_16_left_\@:
	// shift v7 to the right by 16-arg3 bytes, dropping the bytes that
	// were loaded past the end of the data
	adr_l		x0, tbl_shf_table + 16
	sub		x0, x0, arg3
	ld1		{v0.16b}, [x0]
	movi		v9.16b, #0x80
	eor		v0.16b, v0.16b, v9.16b
	tbl		v7.16b, {v7.16b}, v0.16b
	b		.L_128_done_\@
	.endm
ENTRY(crc_t10dif_pmull_p8)
	crc_t10dif_pmull	p8
ENDPROC(crc_t10dif_pmull_p8)

	.align		5
ENTRY(crc_t10dif_pmull_p64)
	crc_t10dif_pmull	p64
ENDPROC(crc_t10dif_pmull_p64)
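//
// A minimal sketch of the C caller side (illustrative, following the
// kernel's crct10dif glue code; kernel_neon_begin/end are required
// because these routines use the NEON register file):
//
//	u16 crc = init_crc;
//	kernel_neon_begin();
//	crc = crc_t10dif_pmull_p64(crc, data, length);
//	kernel_neon_end();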
// precomputed constants
// these constants are precomputed from the poly:
//	0x8bb70000 (0x8bb7 scaled to 32 bits)
	.section	".rodata", "a"
	.align		4
// Q = 0x18BB70000
// rk1 = 2^(32*3) mod Q << 32
// rk2 = 2^(32*5) mod Q << 32
// rk3 = 2^(32*15) mod Q << 32
// rk4 = 2^(32*17) mod Q << 32
// rk5 = 2^(32*3) mod Q << 32
// rk6 = 2^(32*2) mod Q << 32
// rk7 = floor(2^64/Q)
// rk8 = Q
rk1:	.octa		0x06df0000000000002d56000000000000
rk3:	.octa		0x7cf50000000000009d9d000000000000
rk5:	.octa		0x13680000000000002d56000000000000
rk7:	.octa		0x000000018bb7000000000001f65a57f8
rk9:	.octa		0xbfd6000000000000ceae000000000000
rk11:	.octa		0x713c0000000000001e16000000000000
rk13:	.octa		0x80a6000000000000f7f9000000000000
rk15:	.octa		0xe658000000000000044c000000000000
rk17:	.octa		0xa497000000000000ad18000000000000
rk19:	.octa		0xe7b50000000000006ee3000000000000
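//
// Note (explanatory): each .octa packs two 64-bit constants; the
// odd-numbered one (e.g. rk1) lands in the low quadword, i.e. lane d[0]
// consumed by pmull, and the even-numbered one (e.g. rk2) in lane d[1]
// consumed by pmull2, so a single ldr q loads a ready-to-use pair.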
// use these values for shift constants for the tbl/tbx instruction.
// different alignments result in values as shown:
//	DDQ 0x008f8e8d8c8b8a898887868584838281 // shl 15 (16-1) / shr1
//	DDQ 0x01008f8e8d8c8b8a8988878685848382 // shl 14 (16-2) / shr2
//	DDQ 0x0201008f8e8d8c8b8a89888786858483 // shl 13 (16-3) / shr3
//	DDQ 0x030201008f8e8d8c8b8a898887868584 // shl 12 (16-4) / shr4
//	DDQ 0x04030201008f8e8d8c8b8a8988878685 // shl 11 (16-5) / shr5
//	DDQ 0x0504030201008f8e8d8c8b8a89888786 // shl 10 (16-6) / shr6
//	DDQ 0x060504030201008f8e8d8c8b8a898887 // shl 9  (16-7) / shr7
//	DDQ 0x07060504030201008f8e8d8c8b8a8988 // shl 8  (16-8) / shr8
//	DDQ 0x0807060504030201008f8e8d8c8b8a89 // shl 7  (16-9) / shr9
//	DDQ 0x090807060504030201008f8e8d8c8b8a // shl 6  (16-10) / shr10
//	DDQ 0x0a090807060504030201008f8e8d8c8b // shl 5  (16-11) / shr11
//	DDQ 0x0b0a090807060504030201008f8e8d8c // shl 4  (16-12) / shr12
//	DDQ 0x0c0b0a090807060504030201008f8e8d // shl 3  (16-13) / shr13
//	DDQ 0x0d0c0b0a090807060504030201008f8e // shl 2  (16-14) / shr14
//	DDQ 0x0e0d0c0b0a090807060504030201008f // shl 1  (16-15) / shr15
tbl_shf_table:
	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0
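//
// Worked example (explanatory): for a tail of arg3 = 3 bytes, v0 is
// loaded from tbl_shf_table + 16 - 3, i.e. {0x8d, 0x8e, 0x8f, 0x0, 0x1,
// ..., 0xc}. Index bytes with the top bit set make tbl write 0 for that
// lane, so one tbl shifts the register left by 3 bytes; xor-ing the
// indices with 0x80 flips which lanes survive, giving the complementary
// 16-3 byte right shift, and sshr #7 turns the same top bits into the
// blend mask consumed by bsl.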