crypto/sha/asm/keccak1600-x86_64.pl

   1 #!/usr/bin/env perl
   2 # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8 #
   9 # ====================================================================
  10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11 # project. The module is, however, dual licensed under OpenSSL and
  12 # CRYPTOGAMS licenses depending on where you obtain it. For further
  13 # details see http://www.openssl.org/~appro/cryptogams/.
  14 # ====================================================================
  15 #
  16 # Keccak-1600 for x86_64.
  17 #
  18 # June 2017.
  19 #
  20 # Below code is [lane complementing] KECCAK_2X implementation (see
  21 # sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
  22 # instead of actually unrolling the loop pair-wise I simply flip
  23 # pointers to T[][] and A[][] at the end of round. Since number of
  24 # rounds is even, last round writes to A[][] and everything works out.
  25 # How does it compare to x86_64 assembly module in Keccak Code Package?
  26 # Depending on processor it's either as fast or faster by up to 15%...
  27 #
  28 ########################################################################
  29 # Numbers are cycles per processed byte out of large message.
  30 #
  31 #                       r=1088(*)
  32 #
  33 # P4                    25.8
  34 # Core 2                12.9
  35 # Westmere              13.7
  36 # Sandy Bridge          12.9(**)
  37 # Haswell               9.6
  38 # Skylake               9.4
  39 # Silvermont            22.8
  40 # Goldmont              15.8
  41 # VIA Nano              17.3
  42 # Sledgehammer          13.3
  43 # Bulldozer             16.5
  44 # Ryzen                 8.8
  45 #
   46 # (*)   Corresponds to SHA3-256. Improvement over compiler-generated
   47 #       code varies a lot, most common coefficient is 15% in comparison to
  48 #       gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
  49 # (**)  Sandy Bridge has broken rotate instruction. Performance can be
  50 #       improved by 14% by replacing rotates with double-precision
  51 #       shift with same register as source and destination.
  52
   53 # $output is the last argument if it looks like a file (it has an extension)
   54 # $flavour is the first argument if it doesn't look like a file
   55 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
   56 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
   57
      # Flag Win64-style targets: an nasm/masm/mingw64 flavour or a .asm output
      # file. NOTE(review): $win64 is not referenced again in the visible part
      # of this file.
   58 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
   59
      # Take the directory part of $0 and look for the x86_64-xlate.pl
      # translator first next to this script, then under ../../perlasm/
      # relative to it; give up if neither file exists.
   60 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
   61 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
   62 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
   63 die "can't locate x86_64-xlate.pl";
   64
      # Start the translator under the same perl interpreter ($^X), handing it
      # $flavour and $output, and alias STDOUT to the pipe so that every later
      # print is fed to the translator.
   65 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
   66     or die "can't call $xlate: $!";
   67 *STDOUT=*OUT;
  68
  69 my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
  70               8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
  71
  72 my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
  73 my @D = map("%r$_",(8..12));
  74 my @T = map("%r$_",(13..14));
  75 my $iotas = "%r15";
  76
  77 my @rhotates = ([  0,  1, 62, 28, 27 ],
  78                 [ 36, 44,  6, 55, 20 ],
  79                 [  3, 10, 43, 25, 39 ],
  80                 [ 41, 45, 15, 21,  8 ],
  81                 [ 18,  2, 61, 56, 14 ]);
  82
   83 $code.=<<___;     # __KeccakF1600 entry; per round: column sums C[0..4], then D[0..4]
   84 .text
   85
   86 .type   __KeccakF1600,\@abi-omnipotent
   87 .align  32
   88 __KeccakF1600:
   89 .cfi_startproc
   90         mov     $A[4][0](%rdi),@C[0]
   91         mov     $A[4][1](%rdi),@C[1]
   92         mov     $A[4][2](%rdi),@C[2]
   93         mov     $A[4][3](%rdi),@C[3]
   94         mov     $A[4][4](%rdi),@C[4]
   95         jmp     .Loop
   96
   97 .align  32
   98 .Loop:
   99         mov     $A[0][0](%rdi),@D[0]
  100         mov     $A[1][1](%rdi),@D[1]
  101         mov     $A[2][2](%rdi),@D[2]
  102         mov     $A[3][3](%rdi),@D[3]
  103
  104         xor     $A[0][2](%rdi),@C[2]
  105         xor     $A[0][3](%rdi),@C[3]
  106         xor     @D[0],         @C[0]
  107         xor     $A[0][1](%rdi),@C[1]
  108          xor    $A[1][2](%rdi),@C[2]
  109          xor    $A[1][0](%rdi),@C[0]
  110         mov     @C[4],@D[4]
  111         xor     $A[0][4](%rdi),@C[4]
  112
  113         xor     @D[2],         @C[2]
  114         xor     $A[2][0](%rdi),@C[0]
  115          xor    $A[1][3](%rdi),@C[3]
  116          xor    @D[1],         @C[1]
  117          xor    $A[1][4](%rdi),@C[4]
  118
  119         xor     $A[3][2](%rdi),@C[2]
  120         xor     $A[3][0](%rdi),@C[0]
  121          xor    $A[2][3](%rdi),@C[3]
  122          xor    $A[2][1](%rdi),@C[1]
  123          xor    $A[2][4](%rdi),@C[4]
  124
  125         mov     @C[2],@T[0]
  126         rol     \$1,@C[2]
  127         xor     @C[0],@C[2]             # D[1] = ROL64(C[2], 1) ^ C[0]
  128          xor    @D[3],         @C[3]
  129
  130         rol     \$1,@C[0]
  131         xor     @C[3],@C[0]             # D[4] = ROL64(C[0], 1) ^ C[3]
  132          xor    $A[3][1](%rdi),@C[1]
  133
  134         rol     \$1,@C[3]
  135         xor     @C[1],@C[3]             # D[2] = ROL64(C[3], 1) ^ C[1]
  136          xor    $A[3][4](%rdi),@C[4]
  137
  138         rol     \$1,@C[1]
  139         xor     @C[4],@C[1]             # D[0] = ROL64(C[1], 1) ^ C[4]
  140
  141         rol     \$1,@C[4]
  142         xor     @T[0],@C[4]             # D[3] = ROL64(C[4], 1) ^ C[2]
  143 ___
      # Rename registers instead of moving data. Per the comments above, the
      # registers so far called @C[1], @C[2], @C[3], @C[4], @C[0] now hold
      # D[0], D[1], D[2], D[3], D[4] respectively, so they become the new @D.
      # The old @D registers, loaded above with A[0][0], A[1][1], A[2][2],
      # A[3][3] and a copy of the A[4][4] value, become the new @C.
  144         (@D[0..4], @C) = (@C[1..4,0], @D);
  145 $code.=<<___;     # result rows R[0..3][] (read via %rdi, written via %rsi), then row-4 inputs
  146         xor     @D[1],@C[1]
  147         xor     @D[2],@C[2]
  148         rol     \$$rhotates[1][1],@C[1]
  149         xor     @D[3],@C[3]
  150         xor     @D[4],@C[4]
  151         rol     \$$rhotates[2][2],@C[2]
  152         xor     @D[0],@C[0]
  153          mov    @C[1],@T[0]
  154         rol     \$$rhotates[3][3],@C[3]
  155          or     @C[2],@C[1]
  156          xor    @C[0],@C[1]             #           C[0] ^ ( C[1] | C[2])
  157         rol     \$$rhotates[4][4],@C[4]
  158
  159          xor    ($iotas),@C[1]
  160          lea    8($iotas),$iotas
  161
  162         mov     @C[4],@T[1]
  163         and     @C[3],@C[4]
  164          mov    @C[1],$A[0][0](%rsi)    # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
  165         xor     @C[2],@C[4]             #           C[2] ^ ( C[4] & C[3])
  166         not     @C[2]
  167         mov     @C[4],$A[0][2](%rsi)    # R[0][2] = C[2] ^ ( C[4] & C[3])
  168
  169         or      @C[3],@C[2]
  170           mov   $A[4][2](%rdi),@C[4]
  171         xor     @T[0],@C[2]             #           C[1] ^ (~C[2] | C[3])
  172         mov     @C[2],$A[0][1](%rsi)    # R[0][1] = C[1] ^ (~C[2] | C[3])
  173
  174         and     @C[0],@T[0]
  175           mov   $A[1][4](%rdi),@C[1]
  176         xor     @T[1],@T[0]             #           C[4] ^ ( C[1] & C[0])
  177           mov   $A[2][0](%rdi),@C[2]
  178         mov     @T[0],$A[0][4](%rsi)    # R[0][4] = C[4] ^ ( C[1] & C[0])
  179
  180         or      @C[0],@T[1]
  181           mov   $A[0][3](%rdi),@C[0]
  182         xor     @C[3],@T[1]             #           C[3] ^ ( C[4] | C[0])
  183           mov   $A[3][1](%rdi),@C[3]
  184         mov     @T[1],$A[0][3](%rsi)    # R[0][3] = C[3] ^ ( C[4] | C[0])
  185
  186
  187         xor     @D[3],@C[0]
  188         xor     @D[2],@C[4]
  189         rol     \$$rhotates[0][3],@C[0]
  190         xor     @D[1],@C[3]
  191         xor     @D[4],@C[1]
  192         rol     \$$rhotates[4][2],@C[4]
  193         rol     \$$rhotates[3][1],@C[3]
  194         xor     @D[0],@C[2]
  195         rol     \$$rhotates[1][4],@C[1]
  196          mov    @C[0],@T[0]
  197          or     @C[4],@C[0]
  198         rol     \$$rhotates[2][0],@C[2]
  199
  200         xor     @C[3],@C[0]             #           C[3] ^ (C[0] |  C[4])
  201         mov     @C[0],$A[1][3](%rsi)    # R[1][3] = C[3] ^ (C[0] |  C[4])
  202
  203         mov     @C[1],@T[1]
  204         and     @T[0],@C[1]
  205           mov   $A[0][1](%rdi),@C[0]
  206         xor     @C[4],@C[1]             #           C[4] ^ (C[1] &  C[0])
  207         not     @C[4]
  208         mov     @C[1],$A[1][4](%rsi)    # R[1][4] = C[4] ^ (C[1] &  C[0])
  209
  210         or      @C[3],@C[4]
  211           mov   $A[1][2](%rdi),@C[1]
  212         xor     @C[2],@C[4]             #           C[2] ^ (~C[4] | C[3])
  213         mov     @C[4],$A[1][2](%rsi)    # R[1][2] = C[2] ^ (~C[4] | C[3])
  214
  215         and     @C[2],@C[3]
  216           mov   $A[4][0](%rdi),@C[4]
  217         xor     @T[1],@C[3]             #           C[1] ^ (C[3] &  C[2])
  218         mov     @C[3],$A[1][1](%rsi)    # R[1][1] = C[1] ^ (C[3] &  C[2])
  219
  220         or      @C[2],@T[1]
  221           mov   $A[2][3](%rdi),@C[2]
  222         xor     @T[0],@T[1]             #           C[0] ^ (C[1] |  C[2])
  223           mov   $A[3][4](%rdi),@C[3]
  224         mov     @T[1],$A[1][0](%rsi)    # R[1][0] = C[0] ^ (C[1] |  C[2])
  225
  226
  227         xor     @D[3],@C[2]
  228         xor     @D[4],@C[3]
  229         rol     \$$rhotates[2][3],@C[2]
  230         xor     @D[2],@C[1]
  231         rol     \$$rhotates[3][4],@C[3]
  232         xor     @D[0],@C[4]
  233         rol     \$$rhotates[1][2],@C[1]
  234         xor     @D[1],@C[0]
  235         rol     \$$rhotates[4][0],@C[4]
  236          mov    @C[2],@T[0]
  237          and    @C[3],@C[2]
  238         rol     \$$rhotates[0][1],@C[0]
  239
  240         not     @C[3]
  241         xor     @C[1],@C[2]             #            C[1] ^ ( C[2] & C[3])
  242         mov     @C[2],$A[2][1](%rsi)    # R[2][1] =  C[1] ^ ( C[2] & C[3])
  243
  244         mov     @C[4],@T[1]
  245         and     @C[3],@C[4]
  246           mov   $A[2][1](%rdi),@C[2]
  247         xor     @T[0],@C[4]             #            C[2] ^ ( C[4] & ~C[3])
  248         mov     @C[4],$A[2][2](%rsi)    # R[2][2] =  C[2] ^ ( C[4] & ~C[3])
  249
  250         or      @C[1],@T[0]
  251           mov   $A[4][3](%rdi),@C[4]
  252         xor     @C[0],@T[0]             #            C[0] ^ ( C[2] | C[1])
  253         mov     @T[0],$A[2][0](%rsi)    # R[2][0] =  C[0] ^ ( C[2] | C[1])
  254
  255         and     @C[0],@C[1]
  256         xor     @T[1],@C[1]             #            C[4] ^ ( C[1] & C[0])
  257         mov     @C[1],$A[2][4](%rsi)    # R[2][4] =  C[4] ^ ( C[1] & C[0])
  258
  259         or      @C[0],@T[1]
  260           mov   $A[1][0](%rdi),@C[1]
  261         xor     @C[3],@T[1]             #           ~C[3] ^ ( C[0] | C[4])
  262           mov   $A[3][2](%rdi),@C[3]
  263         mov     @T[1],$A[2][3](%rsi)    # R[2][3] = ~C[3] ^ ( C[0] | C[4])
  264
  265
  266         mov     $A[0][4](%rdi),@C[0]
  267
  268         xor     @D[1],@C[2]
  269         xor     @D[2],@C[3]
  270         rol     \$$rhotates[2][1],@C[2]
  271         xor     @D[0],@C[1]
  272         rol     \$$rhotates[3][2],@C[3]
  273         xor     @D[3],@C[4]
  274         rol     \$$rhotates[1][0],@C[1]
  275         xor     @D[4],@C[0]
  276         rol     \$$rhotates[4][3],@C[4]
  277          mov    @C[2],@T[0]
  278          or     @C[3],@C[2]
  279         rol     \$$rhotates[0][4],@C[0]
  280
  281         not     @C[3]
  282         xor     @C[1],@C[2]             #            C[1] ^ ( C[2] | C[3])
  283         mov     @C[2],$A[3][1](%rsi)    # R[3][1] =  C[1] ^ ( C[2] | C[3])
  284
  285         mov     @C[4],@T[1]
  286         or      @C[3],@C[4]
  287         xor     @T[0],@C[4]             #            C[2] ^ ( C[4] | ~C[3])
  288         mov     @C[4],$A[3][2](%rsi)    # R[3][2] =  C[2] ^ ( C[4] | ~C[3])
  289
  290         and     @C[1],@T[0]
  291         xor     @C[0],@T[0]             #            C[0] ^ ( C[2] & C[1])
  292         mov     @T[0],$A[3][0](%rsi)    # R[3][0] =  C[0] ^ ( C[2] & C[1])
  293
  294         or      @C[0],@C[1]
  295         xor     @T[1],@C[1]             #            C[4] ^ ( C[1] | C[0])
  296         mov     @C[1],$A[3][4](%rsi)    # R[3][4] =  C[4] ^ ( C[1] | C[0])
  297
  298         and     @T[1],@C[0]
  299         xor     @C[3],@C[0]             #           ~C[3] ^ ( C[0] & C[4])
  300         mov     @C[0],$A[3][3](%rsi)    # R[3][3] = ~C[3] ^ ( C[0] & C[4])
  301
  302
  303         xor     $A[0][2](%rdi),@D[2]
  304         xor     $A[1][3](%rdi),@D[3]
  305         rol     \$$rhotates[0][2],@D[2]
  306         xor     $A[4][1](%rdi),@D[1]
  307         rol     \$$rhotates[1][3],@D[3]
  308         xor     $A[2][4](%rdi),@D[4]
  309         rol     \$$rhotates[4][1],@D[1]
  310         xor     $A[3][0](%rdi),@D[0]
  311         xchg    %rsi,%rdi
  312         rol     \$$rhotates[2][4],@D[4]
  313         rol     \$$rhotates[3][0],@D[0]
  314 ___
      # Second rename, for the last row. The @D registers were just XORed with
      # A[0][2], A[1][3], A[2][4], A[3][0], A[4][1] and rotated, so they are
      # relabelled as C[0..4] in that order (@D[2], @D[3], @D[4], @D[0], @D[1]).
      # The xchg above has already swapped %rsi and %rdi, so the stores that
      # follow go through %rdi.
  315         @C = @D[2..4,0,1];
  316 $code.=<<___;     # result row R[4][] and loop tail of __KeccakF1600, then the KeccakF1600 wrapper
  317         mov     @C[0],@T[0]
  318         and     @C[1],@C[0]
  319         not     @C[1]
  320         xor     @C[4],@C[0]             #            C[4] ^ ( C[0] & C[1])
  321         mov     @C[0],$A[4][4](%rdi)    # R[4][4] =  C[4] ^ ( C[0] & C[1])
  322
  323         mov     @C[2],@T[1]
  324         and     @C[1],@C[2]
  325         xor     @T[0],@C[2]             #            C[0] ^ ( C[2] & ~C[1])
  326         mov     @C[2],$A[4][0](%rdi)    # R[4][0] =  C[0] ^ ( C[2] & ~C[1])
  327
  328         or      @C[4],@T[0]
  329         xor     @C[3],@T[0]             #            C[3] ^ ( C[0] | C[4])
  330         mov     @T[0],$A[4][3](%rdi)    # R[4][3] =  C[3] ^ ( C[0] | C[4])
  331
  332         and     @C[3],@C[4]
  333         xor     @T[1],@C[4]             #            C[2] ^ ( C[4] & C[3])
  334         mov     @C[4],$A[4][2](%rdi)    # R[4][2] =  C[2] ^ ( C[4] & C[3])
  335
  336         or      @T[1],@C[3]
  337         xor     @C[1],@C[3]             #           ~C[1] ^ ( C[2] | C[3])
  338         mov     @C[3],$A[4][1](%rdi)    # R[4][1] = ~C[1] ^ ( C[2] | C[3])
  339
  340         mov     @C[0],@C[1]             # harmonize with the loop top
  341         mov     @T[0],@C[0]
  342
  343         test    \$255,$iotas
  344         jnz     .Loop
  345
  346         lea     -192($iotas),$iotas     # rewind iotas
  347         ret
  348 .cfi_endproc
  349 .size   __KeccakF1600,.-__KeccakF1600
  350
  351 .type   KeccakF1600,\@abi-omnipotent
  352 .align  32
  353 KeccakF1600:
  354 .cfi_startproc
  355         push    %rbx
  356 .cfi_push       %rbx
  357         push    %rbp
  358 .cfi_push       %rbp
  359         push    %r12
  360 .cfi_push       %r12
  361         push    %r13
  362 .cfi_push       %r13
  363         push    %r14
  364 .cfi_push       %r14
  365         push    %r15
  366 .cfi_push       %r15
  367
  368         lea     100(%rdi),%rdi          # size optimization
  369         sub     \$200,%rsp
  370 .cfi_adjust_cfa_offset  200
  371
  372         notq    $A[0][1](%rdi)
  373         notq    $A[0][2](%rdi)
  374         notq    $A[1][3](%rdi)
  375         notq    $A[2][2](%rdi)
  376         notq    $A[3][2](%rdi)
  377         notq    $A[4][0](%rdi)
  378
  379         lea     iotas(%rip),$iotas
  380         lea     100(%rsp),%rsi          # size optimization
  381
  382         call    __KeccakF1600
  383
  384         notq    $A[0][1](%rdi)
  385         notq    $A[0][2](%rdi)
  386         notq    $A[1][3](%rdi)
  387         notq    $A[2][2](%rdi)
  388         notq    $A[3][2](%rdi)
  389         notq    $A[4][0](%rdi)
  390         lea     -100(%rdi),%rdi         # preserve A[][]
  391
  392         add     \$200,%rsp
  393 .cfi_adjust_cfa_offset  -200
  394
  395         pop     %r15
  396 .cfi_pop        %r15
  397         pop     %r14
  398 .cfi_pop        %r14
  399         pop     %r13
  400 .cfi_pop        %r13
  401         pop     %r12
  402 .cfi_pop        %r12
  403         pop     %rbp
  404 .cfi_pop        %rbp
  405         pop     %rbx
  406 .cfi_pop        %rbx
  407         ret
  408 .cfi_endproc
  409 .size   KeccakF1600,.-KeccakF1600
  410 ___
      # Notes on the text emitted above:
      # - The two "harmonize" moves leave the five R[4][] values in the
      #   registers that the loop top uses as C[0..4], so they are not reloaded.
      # - The loop ends when the low byte of the iotas pointer becomes zero;
      #   the pointer is then rewound by 192 bytes (24 constants of 8 bytes).
      # - KeccakF1600 saves %rbx, %rbp and %r12-%r15, complements six lanes
      #   before and after the call, passes a 200-byte stack buffer in %rsi as
      #   the second state area, and restores %rdi before returning.
 411
  412 { my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
  413      ($A_flat,$inp) = ("%r8","%r9");
      # SHA3_absorb takes four arguments: state, input pointer, length and
      # block size in bytes. $A_flat and $inp are re-pointed at %r8/%r9 because
      # the emitted code uses %rdi as the biased state pointer and %rsi as the
      # biased pointer to the stack scratch area handed to __KeccakF1600.
      #
      # While at least $bsz bytes remain, one block is XORed into the state
      # eight bytes at a time and __KeccakF1600 is called; $inp and $len are
      # spilled to the stack around the call and $bsz is reloaded from its
      # stack slot. The number of bytes left over is returned in %rax. Six
      # lanes are complemented on entry and again on exit.
  414 $code.=<<___;
  415 .globl  SHA3_absorb
  416 .type   SHA3_absorb,\@function,4
  417 .align  32
  418 SHA3_absorb:
  419 .cfi_startproc
  420         push    %rbx
  421 .cfi_push       %rbx
  422         push    %rbp
  423 .cfi_push       %rbp
  424         push    %r12
  425 .cfi_push       %r12
  426         push    %r13
  427 .cfi_push       %r13
  428         push    %r14
  429 .cfi_push       %r14
  430         push    %r15
  431 .cfi_push       %r15
  432
  433         lea     100(%rdi),%rdi          # size optimization
  434         sub     \$232,%rsp
  435 .cfi_adjust_cfa_offset  232
  436
  437         mov     %rsi,$inp
  438         lea     100(%rsp),%rsi          # size optimization
  439
  440         notq    $A[0][1](%rdi)
  441         notq    $A[0][2](%rdi)
  442         notq    $A[1][3](%rdi)
  443         notq    $A[2][2](%rdi)
  444         notq    $A[3][2](%rdi)
  445         notq    $A[4][0](%rdi)
  446         lea     iotas(%rip),$iotas
  447
  448         mov     $bsz,216-100(%rsi)      # save bsz
  449
  450 .Loop_absorb:
  451         cmp     $bsz,$len
  452         jc      .Ldone_absorb
  453
  454         shr     \$3,$bsz
  455         lea     -100(%rdi),$A_flat
  456
  457 .Lblock_absorb:
  458         mov     ($inp),%rax
  459         lea     8($inp),$inp
  460         xor     ($A_flat),%rax
  461         lea     8($A_flat),$A_flat
  462         sub     \$8,$len
  463         mov     %rax,-8($A_flat)
  464         sub     \$1,$bsz
  465         jnz     .Lblock_absorb
  466
  467         mov     $inp,200-100(%rsi)      # save inp
  468         mov     $len,208-100(%rsi)      # save len
  469         call    __KeccakF1600
  470         mov     200-100(%rsi),$inp      # pull inp
  471         mov     208-100(%rsi),$len      # pull len
  472         mov     216-100(%rsi),$bsz      # pull bsz
  473         jmp     .Loop_absorb
  474
  475 .align  32
  476 .Ldone_absorb:
  477         mov     $len,%rax               # return value
  478
  479         notq    $A[0][1](%rdi)
  480         notq    $A[0][2](%rdi)
  481         notq    $A[1][3](%rdi)
  482         notq    $A[2][2](%rdi)
  483         notq    $A[3][2](%rdi)
  484         notq    $A[4][0](%rdi)
  485
  486         add     \$232,%rsp
  487 .cfi_adjust_cfa_offset  -232
  488
  489         pop     %r15
  490 .cfi_pop        %r15
  491         pop     %r14
  492 .cfi_pop        %r14
  493         pop     %r13
  494 .cfi_pop        %r13
  495         pop     %r12
  496 .cfi_pop        %r12
  497         pop     %rbp
  498 .cfi_pop        %rbp
  499         pop     %rbx
  500 .cfi_pop        %rbx
  501         ret
  502 .cfi_endproc
  503 .size   SHA3_absorb,.-SHA3_absorb
  504 ___
  505 }
 506 { my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
 507      ($out,$len,$bsz) = ("%r12","%r13","%r14");
 508
 509 $code.=<<___;
 510 .globl  SHA3_squeeze
 511 .type   SHA3_squeeze,\@function,4
 512 .align  32
 513 SHA3_squeeze:
 514 .cfi_startproc
 515         push    %r12
 516 .cfi_push       %r12
 517         push    %r13
 518 .cfi_push       %r13
 519         push    %r14
 520 .cfi_push       %r14
 521
 522         shr     \$3,%rcx
 523         mov     $A_flat,%r8
 524         mov     %rsi,$out
 525         mov     %rdx,$len
 526         mov     %rcx,$bsz
 527         jmp     .Loop_squeeze
 528
 529 .align  32
 530 .Loop_squeeze:
 531         cmp     \$8,$len
 532         jb      .Ltail_squeeze
 533
 534         mov     (%r8),%rax
 535         lea     8(%r8),%r8
 536         mov     %rax,($out)
 537         lea     8($out),$out
 538         sub     \$8,$len                # len -= 8
 539         jz      .Ldone_squeeze
 540
 541         sub     \$1,%rcx                # bsz--
 542         jnz     .Loop_squeeze
 543
 544         call    KeccakF1600
 545         mov     $A_flat,%r8
 546         mov     $bsz,%rcx
 547         jmp     .Loop_squeeze
 548
 549 .Ltail_squeeze:
 550         mov     %r8, %rsi
 551         mov     $out,%rdi
 552         mov     $len,%rcx
 553         .byte   0xf3,0xa4               # rep   movsb
 554
 555 .Ldone_squeeze:
 556         pop     %r14
 557 .cfi_pop        %r14
 558         pop     %r13
 559 .cfi_pop        %r13
 560         pop     %r12
 561 .cfi_pop        %r13
 562         ret
 563 .cfi_endproc
 564 .size   SHA3_squeeze,.-SHA3_squeeze
 565 ___
 566 }
  567 $code.=<<___;     # iotas: the 24 constants read through the iotas pointer, plus an identification string
  568 .align  256
  569         .quad   0,0,0,0,0,0,0,0
  570 .type   iotas,\@object
  571 iotas:
  572         .quad   0x0000000000000001
  573         .quad   0x0000000000008082
  574         .quad   0x800000000000808a
  575         .quad   0x8000000080008000
  576         .quad   0x000000000000808b
  577         .quad   0x0000000080000001
  578         .quad   0x8000000080008081
  579         .quad   0x8000000000008009
  580         .quad   0x000000000000008a
  581         .quad   0x0000000000000088
  582         .quad   0x0000000080008009
  583         .quad   0x000000008000000a
  584         .quad   0x000000008000808b
  585         .quad   0x800000000000008b
  586         .quad   0x8000000000008089
  587         .quad   0x8000000000008003
  588         .quad   0x8000000000008002
  589         .quad   0x8000000000000080
  590         .quad   0x000000000000800a
  591         .quad   0x800000008000000a
  592         .quad   0x8000000080008081
  593         .quad   0x8000000000008080
  594         .quad   0x0000000080000001
  595         .quad   0x8000000080008008
  596 .size   iotas,.-iotas
  597 .asciz  "Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  598 ___
      # Layout note for the table above: after the 256-byte alignment, the
      # eight zero quads (64 bytes) place the first constant at offset 64, and
      # the 24 constants (192 bytes) end exactly on the next 256-byte boundary.
      # The "test \$255" at the bottom of the round loop relies on this to
      # detect that the last constant has been consumed.
 599
 600 foreach (split("\n",$code)) {
 601         # Below replacement results in 11.2 on Sandy Bridge, 9.4 on
 602         # Haswell, but it hurts other processors by up to 2-3-4x...
 603         #s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
 604         # Below replacement results in 9.3 on Haswell [as well as
 605         # on Ryzen, i.e. it *hurts* Ryzen]...
 606         #s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;
 607
 608         print $_, "\n";
 609 }
 610
 611 close STDOUT or die "error closing STDOUT";