crypto/sha/asm/keccak1600-avx512.pl

   1 #!/usr/bin/env perl
   2 # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8 #
   9 # ====================================================================
  10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11 # project. The module is, however, dual licensed under OpenSSL and
  12 # CRYPTOGAMS licenses depending on where you obtain it. For further
  13 # details see http://www.openssl.org/~appro/cryptogams/.
  14 # ====================================================================
  15 #
  16 # Keccak-1600 for AVX-512F.
  17 #
  18 # July 2017.
  19 #
  20 # Below code is KECCAK_1X_ALT implementation (see sha/keccak1600.c).
  21 # Pretty straightforward, the only "magic" is data layout in registers.
  22 # It's impossible to have one that is optimal for every step, hence
  23 # it's changing as algorithm progresses. Data is saved in linear order,
  24 # but in-register order morphs between rounds. Even rounds take in
  25 # linear layout, and odd rounds - transposed, or "vertically-shaped"...
  26 #
  27 ########################################################################
  28 # Numbers are cycles per processed byte out of large message.
  29 #
  30 #                       r=1088(*)
  31 #
  32 # Knights Landing       7.6
  33 # Skylake-X             5.7
  34 #
  35 # (*)   Corresponds to SHA3-256.
  36
  37 ########################################################################
  38 # Below code is combination of two ideas. One is taken from Keccak Code
  39 # Package, hereafter KCP, and another one from initial version of this
  40 # module. What is common is observation that Pi's input and output are
  41 # "mostly transposed", i.e. if input is aligned by x coordinate, then
  42 # output is [mostly] aligned by y. Both versions, KCP and predecessor,
  43 # were trying to use one of them from round to round, which resulted in
  44 # some kind of transposition in each round. This version still does
  45 # transpose data, but only every second round. Another essential factor
  46 # is that KCP transposition has to be performed with instructions that
  47 # turned out to be rather expensive on Knights Landing, both latency- and
  48 # throughput-wise. Not to mention that some of them have to depend on
  49 # each other. On the other hand initial version of this module was
  50 # relying heavily on blend instructions. There were lots of them,
  51 # resulting in higher instruction count, yet it performed better on
  52 # Knights Landing, because processor can execute pair of them each
  53 # cycle and they have minimal latency. This module is an attempt to
  54 # bring best parts together:-)
  55 #
  56 # Coordinates below correspond to those in sha/keccak1600.c. Input
  57 # layout is straight linear:
  58 #
  59 # [0][4] [0][3] [0][2] [0][1] [0][0]
  60 # [1][4] [1][3] [1][2] [1][1] [1][0]
  61 # [2][4] [2][3] [2][2] [2][1] [2][0]
  62 # [3][4] [3][3] [3][2] [3][1] [3][0]
  63 # [4][4] [4][3] [4][2] [4][1] [4][0]
  64 #
  65 # It's perfect for Theta, while Pi is reduced to intra-register
  66 # permutations which yield layout perfect for Chi:
  67 #
  68 # [4][0] [3][0] [2][0] [1][0] [0][0]
  69 # [4][1] [3][1] [2][1] [1][1] [0][1]
  70 # [4][2] [3][2] [2][2] [1][2] [0][2]
  71 # [4][3] [3][3] [2][3] [1][3] [0][3]
  72 # [4][4] [3][4] [2][4] [1][4] [0][4]
  73 #
  74 # Now instead of performing full transposition and feeding it to next
  75 # identical round, we perform kind of diagonal transposition to layout
  76 # from initial version of this module, and make it suitable for Theta:
  77 #
  78 # [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0]
  79 # [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0]
  80 # [4][1] [3][0] [2][4] [1][3] [0][2]>2.1.0.4.3>[2][4] [1][3] [0][2] [4][1] [3][0]
  81 # [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0]
  82 # [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0]
  83 #
  84 # Now intra-register permutations yield initial [almost] straight
  85 # linear layout:
  86 #
  87 # [4][4] [3][3] [2][2] [1][1] [0][0]
  88 ##[0][4] [0][3] [0][2] [0][1] [0][0]
  89 # [3][4] [2][3] [1][2] [0][1] [4][0]
  90 ##[2][3] [2][2] [2][1] [2][0] [2][4]
  91 # [2][4] [1][3] [0][2] [4][1] [3][0]
  92 ##[4][2] [4][1] [4][0] [4][4] [4][3]
  93 # [1][4] [0][3] [4][2] [3][1] [2][0]
  94 ##[1][1] [1][0] [1][4] [1][3] [1][2]
  95 # [0][4] [4][3] [3][2] [2][1] [1][0]
  96 ##[3][0] [3][4] [3][3] [3][2] [3][1]
  97 #
  98 # This means that odd round Chi is performed in less suitable layout,
  99 # with a number of additional permutations. But overall it turned out to be
 100 # a win. Permutations are fastest possible on Knights Landing and they
 101 # are laid out to be independent of each other. In essence I traded
 102 # 20 blend instructions for 3 permutations. The result is 13% faster
 103 # than KCP on Skylake-X, and >40% on Knights Landing.
 104 #
 105 # As implied, data is loaded in straight linear order. Digits in
 106 # variables' names represent coordinates of right-most element of
 107 # loaded data chunk:
 108
 109 my ($A00,       # [0][4] [0][3] [0][2] [0][1] [0][0]
 110     $A10,       # [1][4] [1][3] [1][2] [1][1] [1][0]
 111     $A20,       # [2][4] [2][3] [2][2] [2][1] [2][0]
 112     $A30,       # [3][4] [3][3] [3][2] [3][1] [3][0]
 113     $A40) =     # [4][4] [4][3] [4][2] [4][1] [4][0]
 114     map("%zmm$_",(0..4));
 115
 116 # We also need to map the magic order into offsets within structure:
 117
 118 my @A_jagged = ([0,0], [0,1], [0,2], [0,3], [0,4],
 119                 [1,0], [1,1], [1,2], [1,3], [1,4],
 120                 [2,0], [2,1], [2,2], [2,3], [2,4],
 121                 [3,0], [3,1], [3,2], [3,3], [3,4],
 122                 [4,0], [4,1], [4,2], [4,3], [4,4]);
 123    @A_jagged = map(8*($$_[0]*8+$$_[1]), @A_jagged);     # ... and now linear
 124
 125 my @T        = map("%zmm$_",(5..12));
 126 my @Theta    = map("%zmm$_",(33,13..16));       # invalid @Theta[0] is not typo
 127 my @Pi0      = map("%zmm$_",(17..21));
 128 my @Rhotate0 = map("%zmm$_",(22..26));
 129 my @Rhotate1 = map("%zmm$_",(27..31));
 130
 131 my ($C00,$D00) = @T[0..1];
 132 my ($k00001,$k00010,$k00100,$k01000,$k10000,$k11111) = map("%k$_",(1..6));
 133
     # Emit __KeccakF1600, the file-local permutation routine that both
     # SHA3_absorb and SHA3_squeeze call.  Inside the here-document \$ and
     # \@ are literal characters of the assembly output, while unescaped
     # $A00, @T[0], ${A00}{$k00001} etc. interpolate the register names
     # chosen above ($T[0] and @T[0] interpolate to the same register).
     #
     # Register usage, as visible in the emitted code:
     #   in/out   $A00..$A40 hold the five state rows.
     #   in       @Theta[1..4], @Rhotate0, @Rhotate1, @Pi0 and the one-bit
     #            masks $k00001..$k10000 are only read here; the callers
     #            load them from the tables at theta_perm beforehand.
     #   scratch  @T[0..7] (including the $C00/$D00 aliases), %eax, %r10.
     #
     # %eax counts 12 passes through .Loop_avx512 and every pass performs an
     # even round followed by an odd round, i.e. 24 rounds in total.  %r10
     # walks the iotas table 16 bytes per pass: the even round reads the
     # constant at (%r10) before the pointer is advanced, the odd round
     # reads the following one at -8(%r10) afterwards.  Both Iota steps are
     # merged under $k00001, so only lane 0 of $A00 is modified.
     #
     # vpternlogq immediates: 0x96 is a three-way XOR (used for Theta) and
     # 0xD2 computes dst ^ (~src2 & src3) (used for Chi).
     #
     # "Harmonize rounds" regroups the data between the two rounds: each
     # @T[n] is assembled lane by lane from different state registers using
     # the one-bit masks, then @T[1..4] are permuted back into $A10..$A40
     # and @T[0] is copied to $A00 at the start of the odd round.
 134 $code.=<<___;
 135 .text
 136
 137 .type   __KeccakF1600,\@function
 138 .align  32
 139 __KeccakF1600:
 140         lea             iotas(%rip),%r10
 141         mov             \$12,%eax
 142         jmp             .Loop_avx512
 143
 144 .align  32
 145 .Loop_avx512:
 146         ######################################### Theta, even round
 147         vmovdqa64       $A00,@T[0]              # put aside original A00
 148         vpternlogq      \$0x96,$A20,$A10,$A00   # and use it as "C00"
 149         vpternlogq      \$0x96,$A40,$A30,$A00
 150
 151         vprolq          \$1,$A00,$D00
 152         vpermq          $A00,@Theta[1],$A00
 153         vpermq          $D00,@Theta[4],$D00
 154
 155         vpternlogq      \$0x96,$A00,$D00,@T[0]  # T[0] is original A00
 156         vpternlogq      \$0x96,$A00,$D00,$A10
 157         vpternlogq      \$0x96,$A00,$D00,$A20
 158         vpternlogq      \$0x96,$A00,$D00,$A30
 159         vpternlogq      \$0x96,$A00,$D00,$A40
 160
 161         ######################################### Rho
 162         vprolvq         @Rhotate0[0],@T[0],$A00 # T[0] is original A00
 163         vprolvq         @Rhotate0[1],$A10,$A10
 164         vprolvq         @Rhotate0[2],$A20,$A20
 165         vprolvq         @Rhotate0[3],$A30,$A30
 166         vprolvq         @Rhotate0[4],$A40,$A40
 167
 168         ######################################### Pi
 169         vpermq          $A00,@Pi0[0],$A00
 170         vpermq          $A10,@Pi0[1],$A10
 171         vpermq          $A20,@Pi0[2],$A20
 172         vpermq          $A30,@Pi0[3],$A30
 173         vpermq          $A40,@Pi0[4],$A40
 174
 175         ######################################### Chi
 176         vmovdqa64       $A00,@T[0]
 177         vmovdqa64       $A10,@T[1]
 178         vpternlogq      \$0xD2,$A20,$A10,$A00
 179         vpternlogq      \$0xD2,$A30,$A20,$A10
 180         vpternlogq      \$0xD2,$A40,$A30,$A20
 181         vpternlogq      \$0xD2,@T[0],$A40,$A30
 182         vpternlogq      \$0xD2,@T[1],@T[0],$A40
 183
 184         ######################################### Iota
 185         vpxorq          (%r10),$A00,${A00}{$k00001}
 186         lea             16(%r10),%r10
 187
 188         ######################################### Harmonize rounds
 189         vpblendmq       $A20,$A10,@{T[1]}{$k00010}
 190         vpblendmq       $A30,$A20,@{T[2]}{$k00010}
 191         vpblendmq       $A40,$A30,@{T[3]}{$k00010}
 192          vpblendmq      $A10,$A00,@{T[0]}{$k00010}
 193         vpblendmq       $A00,$A40,@{T[4]}{$k00010}
 194
 195         vpblendmq       $A30,@T[1],@{T[1]}{$k00100}
 196         vpblendmq       $A40,@T[2],@{T[2]}{$k00100}
 197          vpblendmq      $A20,@T[0],@{T[0]}{$k00100}
 198         vpblendmq       $A00,@T[3],@{T[3]}{$k00100}
 199         vpblendmq       $A10,@T[4],@{T[4]}{$k00100}
 200
 201         vpblendmq       $A40,@T[1],@{T[1]}{$k01000}
 202          vpblendmq      $A30,@T[0],@{T[0]}{$k01000}
 203         vpblendmq       $A00,@T[2],@{T[2]}{$k01000}
 204         vpblendmq       $A10,@T[3],@{T[3]}{$k01000}
 205         vpblendmq       $A20,@T[4],@{T[4]}{$k01000}
 206
 207         vpblendmq       $A40,@T[0],@{T[0]}{$k10000}
 208         vpblendmq       $A00,@T[1],@{T[1]}{$k10000}
 209         vpblendmq       $A10,@T[2],@{T[2]}{$k10000}
 210         vpblendmq       $A20,@T[3],@{T[3]}{$k10000}
 211         vpblendmq       $A30,@T[4],@{T[4]}{$k10000}
 212
 213         #vpermq         @T[0],@Theta[0],$A00    # doesn't actually change order
 214         vpermq          @T[1],@Theta[1],$A10
 215         vpermq          @T[2],@Theta[2],$A20
 216         vpermq          @T[3],@Theta[3],$A30
 217         vpermq          @T[4],@Theta[4],$A40
 218
 219         ######################################### Theta, odd round
 220         vmovdqa64       $T[0],$A00              # real A00
 221         vpternlogq      \$0x96,$A20,$A10,$C00   # C00 is @T[0]'s alias
 222         vpternlogq      \$0x96,$A40,$A30,$C00
 223
 224         vprolq          \$1,$C00,$D00
 225         vpermq          $C00,@Theta[1],$C00
 226         vpermq          $D00,@Theta[4],$D00
 227
 228         vpternlogq      \$0x96,$C00,$D00,$A00
 229         vpternlogq      \$0x96,$C00,$D00,$A30
 230         vpternlogq      \$0x96,$C00,$D00,$A10
 231         vpternlogq      \$0x96,$C00,$D00,$A40
 232         vpternlogq      \$0x96,$C00,$D00,$A20
 233
 234         ######################################### Rho
 235         vprolvq         @Rhotate1[0],$A00,$A00
 236         vprolvq         @Rhotate1[3],$A30,@T[1]
 237         vprolvq         @Rhotate1[1],$A10,@T[2]
 238         vprolvq         @Rhotate1[4],$A40,@T[3]
 239         vprolvq         @Rhotate1[2],$A20,@T[4]
 240
 241          vpermq         $A00,@Theta[4],@T[5]
 242          vpermq         $A00,@Theta[3],@T[6]
 243
 244         ######################################### Iota
 245         vpxorq          -8(%r10),$A00,${A00}{$k00001}
 246
 247         ######################################### Pi
 248         vpermq          @T[1],@Theta[2],$A10
 249         vpermq          @T[2],@Theta[4],$A20
 250         vpermq          @T[3],@Theta[1],$A30
 251         vpermq          @T[4],@Theta[3],$A40
 252
 253         ######################################### Chi
 254         vpternlogq      \$0xD2,@T[6],@T[5],$A00
 255
 256         vpermq          @T[1],@Theta[1],@T[7]
 257         #vpermq         @T[1],@Theta[0],@T[1]
 258         vpternlogq      \$0xD2,@T[1],@T[7],$A10
 259
 260         vpermq          @T[2],@Theta[3],@T[0]
 261         vpermq          @T[2],@Theta[2],@T[2]
 262         vpternlogq      \$0xD2,@T[2],@T[0],$A20
 263
 264         #vpermq         @T[3],@Theta[0],@T[3]
 265         vpermq          @T[3],@Theta[4],@T[1]
 266         vpternlogq      \$0xD2,@T[1],@T[3],$A30
 267
 268         vpermq          @T[4],@Theta[2],@T[0]
 269         vpermq          @T[4],@Theta[1],@T[4]
 270         vpternlogq      \$0xD2,@T[4],@T[0],$A40
 271
 272         dec             %eax
 273         jnz             .Loop_avx512
 274
 275         ret
 276 .size   __KeccakF1600,.-__KeccakF1600
 277 ___
 278
 279 my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
 280 my  $out = $inp;        # in squeeze
 281
     # Emit the first part of SHA3_absorb.  $A_flat, $inp, $len and $bsz are
     # the register names declared just above (%rdi, %rsi, %rdx, %rcx).
     #
     # What the emitted prologue does:
     #   - saves %rsp in %r11, then reserves 320 bytes and aligns %rsp down
     #     to a multiple of 64; %r9 is set to %rsp+128, so N*64-128(%r9)
     #     addresses row N of this 5 x 64-byte transfer area;
     #   - biases $A_flat and $inp by +96; the displacements applied to
     #     them afterwards subtract 96 again;
     #   - builds the opmasks: kxnorw yields all ones, a right shift by 15
     #     leaves bit 0 ($k00001), a right shift by 11 leaves the low five
     #     bits ($k11111), and the left shifts derive the other one-bit
     #     masks from $k00001;
     #   - loads @Theta[1..4], @Rhotate1, @Rhotate0 and @Pi0 from consecutive
     #     64-byte rows starting at theta_perm (row 0 is skipped on purpose,
     #     its load is commented out); %r8 holds the table address here and
     #     is reused as a scratch register by the copy code that follows;
     #   - loads the state as five 40-byte rows under $k11111 with zeroing,
     #     so lanes 5..7 of every state register are cleared;
     #   - zeroes the transfer area.
     #
     # .Loop_absorb_avx512 subtracts $bsz from $len and leaves through
     # .Ldone_absorb_avx512 on borrow; otherwise %eax becomes $bsz/8, the
     # number of 8-byte lanes to copy.  The here-document stops in the
     # middle of the function: the per-lane copy code is appended by the
     # loop that follows.
 282 $code.=<<___;
 283 .globl  SHA3_absorb
 284 .type   SHA3_absorb,\@function
 285 .align  32
 286 SHA3_absorb:
 287         mov     %rsp,%r11
 288
 289         lea     -320(%rsp),%rsp
 290         and     \$-64,%rsp
 291
 292         lea     96($A_flat),$A_flat
 293         lea     96($inp),$inp
 294         lea     128(%rsp),%r9
 295
 296         lea             theta_perm(%rip),%r8
 297
 298         kxnorw          $k11111,$k11111,$k11111
 299         kshiftrw        \$15,$k11111,$k00001
 300         kshiftrw        \$11,$k11111,$k11111
 301         kshiftlw        \$1,$k00001,$k00010
 302         kshiftlw        \$2,$k00001,$k00100
 303         kshiftlw        \$3,$k00001,$k01000
 304         kshiftlw        \$4,$k00001,$k10000
 305
 306         #vmovdqa64      64*0(%r8),@Theta[0]
 307         vmovdqa64       64*1(%r8),@Theta[1]
 308         vmovdqa64       64*2(%r8),@Theta[2]
 309         vmovdqa64       64*3(%r8),@Theta[3]
 310         vmovdqa64       64*4(%r8),@Theta[4]
 311
 312         vmovdqa64       64*5(%r8),@Rhotate1[0]
 313         vmovdqa64       64*6(%r8),@Rhotate1[1]
 314         vmovdqa64       64*7(%r8),@Rhotate1[2]
 315         vmovdqa64       64*8(%r8),@Rhotate1[3]
 316         vmovdqa64       64*9(%r8),@Rhotate1[4]
 317
 318         vmovdqa64       64*10(%r8),@Rhotate0[0]
 319         vmovdqa64       64*11(%r8),@Rhotate0[1]
 320         vmovdqa64       64*12(%r8),@Rhotate0[2]
 321         vmovdqa64       64*13(%r8),@Rhotate0[3]
 322         vmovdqa64       64*14(%r8),@Rhotate0[4]
 323
 324         vmovdqa64       64*15(%r8),@Pi0[0]
 325         vmovdqa64       64*16(%r8),@Pi0[1]
 326         vmovdqa64       64*17(%r8),@Pi0[2]
 327         vmovdqa64       64*18(%r8),@Pi0[3]
 328         vmovdqa64       64*19(%r8),@Pi0[4]
 329
 330         vmovdqu64       40*0-96($A_flat),${A00}{$k11111}{z}
 331         vpxorq          @T[0],@T[0],@T[0]
 332         vmovdqu64       40*1-96($A_flat),${A10}{$k11111}{z}
 333         vmovdqu64       40*2-96($A_flat),${A20}{$k11111}{z}
 334         vmovdqu64       40*3-96($A_flat),${A30}{$k11111}{z}
 335         vmovdqu64       40*4-96($A_flat),${A40}{$k11111}{z}
 336
 337         vmovdqa64       @T[0],0*64-128(%r9)     # zero transfer area on stack
 338         vmovdqa64       @T[0],1*64-128(%r9)
 339         vmovdqa64       @T[0],2*64-128(%r9)
 340         vmovdqa64       @T[0],3*64-128(%r9)
 341         vmovdqa64       @T[0],4*64-128(%r9)
 342         jmp             .Loop_absorb_avx512
 343
 344 .align  32
 345 .Loop_absorb_avx512:
 346         mov             $bsz,%rax
 347         sub             $bsz,$len
 348         jc              .Ldone_absorb_avx512
 349
 350         shr             \$3,%eax
 351 ___
 352 for(my $i=0; $i<25; $i++) {
     # Unrolled copy of one input block into the transfer area.  For each
     # of the 25 lanes the emitted code moves 8 bytes from 8*$i-96($inp)
     # to $A_jagged[$i]-128(%r9) through %r8, decrements the lane counter
     # in %eax and branches to .Labsorved_avx512 once it reaches zero.
     # The here-document statement has no terminating semicolon; it is the
     # last statement of the loop body, so Perl accepts it.
 353 $code.=<<___
 354         mov     8*$i-96($inp),%r8
 355         mov     %r8,$A_jagged[$i]-128(%r9)
 356         dec     %eax
 357         jz      .Labsorved_avx512
 358 ___
 359 }
     # Emit the rest of SHA3_absorb, all of SHA3_squeeze and the constant
     # tables.
     #
     # SHA3_absorb (continued): advance $inp by $bsz, XOR the five rows of
     # the transfer area into $A00..$A40, call __KeccakF1600 and go back to
     # .Loop_absorb_avx512.  At .Ldone_absorb_avx512 the state is stored
     # back under $k11111, %rsp is restored from %r11 and %rax is set to
     # $len + $bsz.  $len has just had $bsz subtracted with a borrow, so
     # that sum is the number of input bytes that were not absorbed.
     #
     # SHA3_squeeze: uses the same argument registers, with $out being the
     # same register as $inp.  Masks, tables and the state registers are
     # loaded only when $len > $bsz (the cmp/jbe pair skips the loads
     # otherwise).
     # NOTE(review): with $len <= $bsz the copy loop appears to finish
     # before the call to __KeccakF1600 is reached, provided $bsz is a
     # multiple of 8 - confirm against the callers.
     # The loop copies 8-byte lanes from the flat state to $out with %rax
     # counting the $bsz/8 lanes of a block; when the counter runs out it
     # calls __KeccakF1600, stores the state back and starts over.  When
     # fewer than 8 bytes remain, .Ltail_squeeze_avx512 copies them with
     # "rep movsb", emitted as raw bytes.
     #
     # Tables: theta_perm, rhotates1, rhotates0 and pi0_perm each consist of
     # five rows of eight quadwords (64 bytes per row) and are addressed as
     # 64*N(%r8) relative to theta_perm, so their order and size must stay
     # in step with the loads emitted above.  iotas holds 24 quadword
     # constants; __KeccakF1600 reads two of them per loop pass.
 360 $code.=<<___;
 361 .Labsorved_avx512:
 362         lea     ($inp,$bsz),$inp
 363
 364         vpxorq  64*0-128(%r9),$A00,$A00
 365         vpxorq  64*1-128(%r9),$A10,$A10
 366         vpxorq  64*2-128(%r9),$A20,$A20
 367         vpxorq  64*3-128(%r9),$A30,$A30
 368         vpxorq  64*4-128(%r9),$A40,$A40
 369
 370         call    __KeccakF1600
 371
 372         jmp     .Loop_absorb_avx512
 373
 374 .align  32
 375 .Ldone_absorb_avx512:
 376         vmovdqu64       $A00,40*0-96($A_flat){$k11111}
 377         vmovdqu64       $A10,40*1-96($A_flat){$k11111}
 378         vmovdqu64       $A20,40*2-96($A_flat){$k11111}
 379         vmovdqu64       $A30,40*3-96($A_flat){$k11111}
 380         vmovdqu64       $A40,40*4-96($A_flat){$k11111}
 381
 382         vzeroupper
 383
 384         lea     (%r11),%rsp
 385         lea     ($len,$bsz),%rax                # return value
 386         ret
 387 .size   SHA3_absorb,.-SHA3_absorb
 388
 389 .globl  SHA3_squeeze
 390 .type   SHA3_squeeze,\@function
 391 .align  32
 392 SHA3_squeeze:
 393         mov     %rsp,%r11
 394
 395         lea     96($A_flat),$A_flat
 396         cmp     $bsz,$len
 397         jbe     .Lno_output_extension_avx512
 398
 399         lea             theta_perm(%rip),%r8
 400
 401         kxnorw          $k11111,$k11111,$k11111
 402         kshiftrw        \$15,$k11111,$k00001
 403         kshiftrw        \$11,$k11111,$k11111
 404         kshiftlw        \$1,$k00001,$k00010
 405         kshiftlw        \$2,$k00001,$k00100
 406         kshiftlw        \$3,$k00001,$k01000
 407         kshiftlw        \$4,$k00001,$k10000
 408
 409         #vmovdqa64      64*0(%r8),@Theta[0]
 410         vmovdqa64       64*1(%r8),@Theta[1]
 411         vmovdqa64       64*2(%r8),@Theta[2]
 412         vmovdqa64       64*3(%r8),@Theta[3]
 413         vmovdqa64       64*4(%r8),@Theta[4]
 414
 415         vmovdqa64       64*5(%r8),@Rhotate1[0]
 416         vmovdqa64       64*6(%r8),@Rhotate1[1]
 417         vmovdqa64       64*7(%r8),@Rhotate1[2]
 418         vmovdqa64       64*8(%r8),@Rhotate1[3]
 419         vmovdqa64       64*9(%r8),@Rhotate1[4]
 420
 421         vmovdqa64       64*10(%r8),@Rhotate0[0]
 422         vmovdqa64       64*11(%r8),@Rhotate0[1]
 423         vmovdqa64       64*12(%r8),@Rhotate0[2]
 424         vmovdqa64       64*13(%r8),@Rhotate0[3]
 425         vmovdqa64       64*14(%r8),@Rhotate0[4]
 426
 427         vmovdqa64       64*15(%r8),@Pi0[0]
 428         vmovdqa64       64*16(%r8),@Pi0[1]
 429         vmovdqa64       64*17(%r8),@Pi0[2]
 430         vmovdqa64       64*18(%r8),@Pi0[3]
 431         vmovdqa64       64*19(%r8),@Pi0[4]
 432
 433         vmovdqu64       40*0-96($A_flat),${A00}{$k11111}{z}
 434         vmovdqu64       40*1-96($A_flat),${A10}{$k11111}{z}
 435         vmovdqu64       40*2-96($A_flat),${A20}{$k11111}{z}
 436         vmovdqu64       40*3-96($A_flat),${A30}{$k11111}{z}
 437         vmovdqu64       40*4-96($A_flat),${A40}{$k11111}{z}
 438
 439 .Lno_output_extension_avx512:
 440         shr     \$3,$bsz
 441         lea     -96($A_flat),%r9
 442         mov     $bsz,%rax
 443         jmp     .Loop_squeeze_avx512
 444
 445 .align  32
 446 .Loop_squeeze_avx512:
 447         cmp     \$8,$len
 448         jb      .Ltail_squeeze_avx512
 449
 450         mov     (%r9),%r8
 451         lea     8(%r9),%r9
 452         mov     %r8,($out)
 453         lea     8($out),$out
 454         sub     \$8,$len                # len -= 8
 455         jz      .Ldone_squeeze_avx512
 456
 457         sub     \$1,%rax                # bsz--
 458         jnz     .Loop_squeeze_avx512
 459
 460         #vpermq         @Theta[4],@Theta[4],@Theta[3]
 461         #vpermq         @Theta[3],@Theta[4],@Theta[2]
 462         #vpermq         @Theta[3],@Theta[3],@Theta[1]
 463
 464         call            __KeccakF1600
 465
 466         vmovdqu64       $A00,40*0-96($A_flat){$k11111}
 467         vmovdqu64       $A10,40*1-96($A_flat){$k11111}
 468         vmovdqu64       $A20,40*2-96($A_flat){$k11111}
 469         vmovdqu64       $A30,40*3-96($A_flat){$k11111}
 470         vmovdqu64       $A40,40*4-96($A_flat){$k11111}
 471
 472         lea     -96($A_flat),%r9
 473         mov     $bsz,%rax
 474         jmp     .Loop_squeeze_avx512
 475
 476 .Ltail_squeeze_avx512:
 477         mov     $out,%rdi
 478         mov     %r9,%rsi
 479         mov     $len,%rcx
 480         .byte   0xf3,0xa4               # rep movsb
 481
 482 .Ldone_squeeze_avx512:
 483         vzeroupper
 484
 485         lea     (%r11),%rsp
 486         ret
 487 .size   SHA3_squeeze,.-SHA3_squeeze
 488
 489 .align  64
 490 theta_perm:
 491         .quad   0, 1, 2, 3, 4, 5, 6, 7          # [not used]
 492         .quad   4, 0, 1, 2, 3, 5, 6, 7
 493         .quad   3, 4, 0, 1, 2, 5, 6, 7
 494         .quad   2, 3, 4, 0, 1, 5, 6, 7
 495         .quad   1, 2, 3, 4, 0, 5, 6, 7
 496
 497 rhotates1:
 498         .quad   0,  44, 43, 21, 14, 0, 0, 0     # [0][0] [1][1] [2][2] [3][3] [4][4]
 499         .quad   18, 1,  6,  25, 8,  0, 0, 0     # [4][0] [0][1] [1][2] [2][3] [3][4]
 500         .quad   41, 2,  62, 55, 39, 0, 0, 0     # [3][0] [4][1] [0][2] [1][3] [2][4]
 501         .quad   3,  45, 61, 28, 20, 0, 0, 0     # [2][0] [3][1] [4][2] [0][3] [1][4]
 502         .quad   36, 10, 15, 56, 27, 0, 0, 0     # [1][0] [2][1] [3][2] [4][3] [0][4]
 503
 504 rhotates0:
 505         .quad    0,  1, 62, 28, 27, 0, 0, 0
 506         .quad   36, 44,  6, 55, 20, 0, 0, 0
 507         .quad    3, 10, 43, 25, 39, 0, 0, 0
 508         .quad   41, 45, 15, 21,  8, 0, 0, 0
 509         .quad   18,  2, 61, 56, 14, 0, 0, 0
 510
 511 pi0_perm:
 512         .quad   0, 3, 1, 4, 2, 5, 6, 7
 513         .quad   1, 4, 2, 0, 3, 5, 6, 7
 514         .quad   2, 0, 3, 1, 4, 5, 6, 7
 515         .quad   3, 1, 4, 2, 0, 5, 6, 7
 516         .quad   4, 2, 0, 3, 1, 5, 6, 7
 517
 518
 519 iotas:
 520         .quad   0x0000000000000001
 521         .quad   0x0000000000008082
 522         .quad   0x800000000000808a
 523         .quad   0x8000000080008000
 524         .quad   0x000000000000808b
 525         .quad   0x0000000080000001
 526         .quad   0x8000000080008081
 527         .quad   0x8000000000008009
 528         .quad   0x000000000000008a
 529         .quad   0x0000000000000088
 530         .quad   0x0000000080008009
 531         .quad   0x000000008000000a
 532         .quad   0x000000008000808b
 533         .quad   0x800000000000008b
 534         .quad   0x8000000000008089
 535         .quad   0x8000000000008003
 536         .quad   0x8000000000008002
 537         .quad   0x8000000000000080
 538         .quad   0x000000000000800a
 539         .quad   0x800000008000000a
 540         .quad   0x8000000080008081
 541         .quad   0x8000000000008080
 542         .quad   0x0000000080000001
 543         .quad   0x8000000080008008
 544
 545 .asciz  "Keccak-1600 absorb and squeeze for AVX-512F, CRYPTOGAMS by <appro\@openssl.org>"
 546 ___
 547
 548 $output=pop and open STDOUT,">$output";
 549 print $code;
 550 close STDOUT;