2 # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for x86_64.
20 # Below code is [lane complementing] KECCAK_2X implementation (see
21 # sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
22 # instead of actually unrolling the loop pair-wise I simply flip
23 # pointers to T[][] and A[][] at the end of round. Since number of
24 # rounds is even, last round writes to A[][] and everything works out.
25 # How does it compare to x86_64 assembly module in Keccak Code Package?
26 # Depending on processor it's either as fast or faster by up to 15%...
28 ########################################################################
29 # Numbers are cycles per processed byte out of large message.
36 # Sandy Bridge 12.9(**)
46 # (*) Corresponds to SHA3-256. Improvement over compiler-generated
47 # code varies a lot; the most common coefficient is 15% in comparison to
48 # gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
49 # (**) Sandy Bridge has broken rotate instruction. Performance can be
50 # improved by 14% by replacing rotates with double-precision
51 # shift with same register as source and destination.
# If the "flavour" argument actually carries the output filename (it
# contains a dot), treat it as $output instead.  NOTE(review): $flavour
# and $output are presumably shifted off @ARGV before this point — confirm
# against the full file.
55 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 calling convention is selected by a nasm/masm/mingw64 flavour or
# by an .asm output filename.
57 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the x86_64-xlate.pl translator either next to this script or in
# the sibling perlasm directory; it converts the perlasm emitted below
# into the requested assembler dialect.
59 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
60 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
61 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
62 die "can't locate x86_64-xlate.pl";
# Pipe everything printed to OUT through the translator into $output.
64 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
# @A[y][x] holds the byte offset of Keccak lane A[y][x] (25 x 64-bit
# lanes, row-major) relative to a state pointer biased by +100 — the bias
# lets the whole 200-byte state be addressed with short single-byte
# displacements (see the "lea 100(%rdi),%rdi # size optimization" below).
67 my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
68 8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
# C[5] and D[5] of the theta step are kept entirely in registers;
# @T holds two scratch registers.
70 my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
71 my @D = map("%r$_",(8..12));
72 my @T = map("%r$_",(13..14));
# Per-lane left-rotation amounts for the rho step of Keccak-f[1600],
# indexed as $rhotates[y][x] to match @A above.
75 my @rhotates = ([ 0, 1, 62, 28, 27 ],
76 [ 36, 44, 6, 55, 20 ],
77 [ 3, 10, 43, 25, 39 ],
78 [ 41, 45, 15, 21, 8 ],
79 [ 18, 2, 61, 56, 14 ]);
84 .type __KeccakF1600,\@abi-omnipotent
87 mov $A[4][0](%rdi),@C[0]
88 mov $A[4][1](%rdi),@C[1]
89 mov $A[4][2](%rdi),@C[2]
90 mov $A[4][3](%rdi),@C[3]
91 mov $A[4][4](%rdi),@C[4]
96 mov $A[0][0](%rdi),@D[0]
97 mov $A[1][1](%rdi),@D[1]
98 mov $A[2][2](%rdi),@D[2]
99 mov $A[3][3](%rdi),@D[3]
101 xor $A[0][2](%rdi),@C[2]
102 xor $A[0][3](%rdi),@C[3]
104 xor $A[0][1](%rdi),@C[1]
105 xor $A[1][2](%rdi),@C[2]
106 xor $A[1][0](%rdi),@C[0]
108 xor $A[0][4](%rdi),@C[4]
111 xor $A[2][0](%rdi),@C[0]
112 xor $A[1][3](%rdi),@C[3]
114 xor $A[1][4](%rdi),@C[4]
116 xor $A[3][2](%rdi),@C[2]
117 xor $A[3][0](%rdi),@C[0]
118 xor $A[2][3](%rdi),@C[3]
119 xor $A[2][1](%rdi),@C[1]
120 xor $A[2][4](%rdi),@C[4]
124 xor @C[0],@C[2] # D[1] = ROL64(C[2], 1) ^ C[0]
128 xor @C[3],@C[0] # D[4] = ROL64(C[0], 1) ^ C[3]
129 xor $A[3][1](%rdi),@C[1]
132 xor @C[1],@C[3] # D[2] = ROL64(C[3], 1) ^ C[1]
133 xor $A[3][4](%rdi),@C[4]
136 xor @C[4],@C[1] # D[0] = ROL64(C[1], 1) ^ C[4]
139 xor @T[0],@C[4] # D[3] = ROL64(C[4], 1) ^ C[2]
141 (@D[0..4], @C) = (@C[1..4,0], @D);
145 rol \$$rhotates[1][1],@C[1]
148 rol \$$rhotates[2][2],@C[2]
151 rol \$$rhotates[3][3],@C[3]
153 xor @C[0],@C[1] # C[0] ^ ( C[1] | C[2])
154 rol \$$rhotates[4][4],@C[4]
161 mov @C[1],$A[0][0](%rsi) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
162 xor @C[2],@C[4] # C[2] ^ ( C[4] & C[3])
164 mov @C[4],$A[0][2](%rsi) # R[0][2] = C[2] ^ ( C[4] & C[3])
167 mov $A[4][2](%rdi),@C[4]
168 xor @T[0],@C[2] # C[1] ^ (~C[2] | C[3])
169 mov @C[2],$A[0][1](%rsi) # R[0][1] = C[1] ^ (~C[2] | C[3])
172 mov $A[1][4](%rdi),@C[1]
173 xor @T[1],@T[0] # C[4] ^ ( C[1] & C[0])
174 mov $A[2][0](%rdi),@C[2]
175 mov @T[0],$A[0][4](%rsi) # R[0][4] = C[4] ^ ( C[1] & C[0])
178 mov $A[0][3](%rdi),@C[0]
179 xor @C[3],@T[1] # C[3] ^ ( C[4] | C[0])
180 mov $A[3][1](%rdi),@C[3]
181 mov @T[1],$A[0][3](%rsi) # R[0][3] = C[3] ^ ( C[4] | C[0])
186 rol \$$rhotates[0][3],@C[0]
189 rol \$$rhotates[4][2],@C[4]
190 rol \$$rhotates[3][1],@C[3]
192 rol \$$rhotates[1][4],@C[1]
195 rol \$$rhotates[2][0],@C[2]
197 xor @C[3],@C[0] # C[3] ^ (C[0] | C[4])
198 mov @C[0],$A[1][3](%rsi) # R[1][3] = C[3] ^ (C[0] | C[4])
202 mov $A[0][1](%rdi),@C[0]
203 xor @C[4],@C[1] # C[4] ^ (C[1] & C[0])
205 mov @C[1],$A[1][4](%rsi) # R[1][4] = C[4] ^ (C[1] & C[0])
208 mov $A[1][2](%rdi),@C[1]
209 xor @C[2],@C[4] # C[2] ^ (~C[4] | C[3])
210 mov @C[4],$A[1][2](%rsi) # R[1][2] = C[2] ^ (~C[4] | C[3])
213 mov $A[4][0](%rdi),@C[4]
214 xor @T[1],@C[3] # C[1] ^ (C[3] & C[2])
215 mov @C[3],$A[1][1](%rsi) # R[1][1] = C[1] ^ (C[3] & C[2])
218 mov $A[2][3](%rdi),@C[2]
219 xor @T[0],@T[1] # C[0] ^ (C[1] | C[2])
220 mov $A[3][4](%rdi),@C[3]
221 mov @T[1],$A[1][0](%rsi) # R[1][0] = C[0] ^ (C[1] | C[2])
226 rol \$$rhotates[2][3],@C[2]
228 rol \$$rhotates[3][4],@C[3]
230 rol \$$rhotates[1][2],@C[1]
232 rol \$$rhotates[4][0],@C[4]
235 rol \$$rhotates[0][1],@C[0]
238 xor @C[1],@C[2] # C[1] ^ ( C[2] & C[3])
239 mov @C[2],$A[2][1](%rsi) # R[2][1] = C[1] ^ ( C[2] & C[3])
243 mov $A[2][1](%rdi),@C[2]
244 xor @T[0],@C[4] # C[2] ^ ( C[4] & ~C[3])
245 mov @C[4],$A[2][2](%rsi) # R[2][2] = C[2] ^ ( C[4] & ~C[3])
248 mov $A[4][3](%rdi),@C[4]
249 xor @C[0],@T[0] # C[0] ^ ( C[2] | C[1])
250 mov @T[0],$A[2][0](%rsi) # R[2][0] = C[0] ^ ( C[2] | C[1])
253 xor @T[1],@C[1] # C[4] ^ ( C[1] & C[0])
254 mov @C[1],$A[2][4](%rsi) # R[2][4] = C[4] ^ ( C[1] & C[0])
257 mov $A[1][0](%rdi),@C[1]
258 xor @C[3],@T[1] # ~C[3] ^ ( C[0] | C[4])
259 mov $A[3][2](%rdi),@C[3]
260 mov @T[1],$A[2][3](%rsi) # R[2][3] = ~C[3] ^ ( C[0] | C[4])
263 mov $A[0][4](%rdi),@C[0]
267 rol \$$rhotates[2][1],@C[2]
269 rol \$$rhotates[3][2],@C[3]
271 rol \$$rhotates[1][0],@C[1]
273 rol \$$rhotates[4][3],@C[4]
276 rol \$$rhotates[0][4],@C[0]
279 xor @C[1],@C[2] # C[1] ^ ( C[2] | C[3])
280 mov @C[2],$A[3][1](%rsi) # R[3][1] = C[1] ^ ( C[2] | C[3])
284 xor @T[0],@C[4] # C[2] ^ ( C[4] | ~C[3])
285 mov @C[4],$A[3][2](%rsi) # R[3][2] = C[2] ^ ( C[4] | ~C[3])
288 xor @C[0],@T[0] # C[0] ^ ( C[2] & C[1])
289 mov @T[0],$A[3][0](%rsi) # R[3][0] = C[0] ^ ( C[2] & C[1])
292 xor @T[1],@C[1] # C[4] ^ ( C[1] | C[0])
293 mov @C[1],$A[3][4](%rsi) # R[3][4] = C[4] ^ ( C[1] | C[0])
296 xor @C[3],@C[0] # ~C[3] ^ ( C[0] & C[4])
297 mov @C[0],$A[3][3](%rsi) # R[3][3] = ~C[3] ^ ( C[0] & C[4])
300 xor $A[0][2](%rdi),@D[2]
301 xor $A[1][3](%rdi),@D[3]
302 rol \$$rhotates[0][2],@D[2]
303 xor $A[4][1](%rdi),@D[1]
304 rol \$$rhotates[1][3],@D[3]
305 xor $A[2][4](%rdi),@D[4]
306 rol \$$rhotates[4][1],@D[1]
307 xor $A[3][0](%rdi),@D[0]
309 rol \$$rhotates[2][4],@D[4]
310 rol \$$rhotates[3][0],@D[0]
317 xor @C[4],@C[0] # C[4] ^ ( C[0] & C[1])
318 mov @C[0],$A[4][4](%rdi) # R[4][4] = C[4] ^ ( C[0] & C[1])
322 xor @T[0],@C[2] # C[0] ^ ( C[2] & ~C[1])
323 mov @C[2],$A[4][0](%rdi) # R[4][0] = C[0] ^ ( C[2] & ~C[1])
326 xor @C[3],@T[0] # C[3] ^ ( C[0] | C[4])
327 mov @T[0],$A[4][3](%rdi) # R[4][3] = C[3] ^ ( C[0] | C[4])
330 xor @T[1],@C[4] # C[2] ^ ( C[4] & C[3])
331 mov @C[4],$A[4][2](%rdi) # R[4][2] = C[2] ^ ( C[4] & C[3])
334 xor @C[1],@C[3] # ~C[1] ^ ( C[2] | C[3])
335 mov @C[3],$A[4][1](%rdi) # R[4][1] = ~C[1] ^ ( C[2] | C[3])
337 mov @C[0],@C[1] # harmonize with the loop top
343 lea -192($iotas),$iotas # rewind iotas
345 .size __KeccakF1600,.-__KeccakF1600
347 .type KeccakF1600,\@abi-omnipotent
364 lea 100(%rdi),%rdi # size optimization
366 .cfi_adjust_cfa_offset 200
375 lea iotas(%rip),$iotas
376 lea 100(%rsp),%rsi # size optimization
386 lea -100(%rdi),%rdi # preserve A[][]
389 .cfi_adjust_cfa_offset -200
405 .size KeccakF1600,.-KeccakF1600
408 { my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
409 ($A_flat,$inp) = ("%r8","%r9");
412 .type SHA3_absorb,\@function,4
429 lea 100(%rdi),%rdi # size optimization
431 .cfi_adjust_cfa_offset 232
434 lea 100(%rsp),%rsi # size optimization
442 lea iotas(%rip),$iotas
444 mov $bsz,216-100(%rsi) # save bsz
451 lea -100(%rdi),$A_flat
457 lea 8($A_flat),$A_flat
463 mov $inp,200-100(%rsi) # save inp
464 mov $len,208-100(%rsi) # save len
466 mov 200-100(%rsi),$inp # pull inp
467 mov 208-100(%rsi),$len # pull len
468 mov 216-100(%rsi),$bsz # pull bsz
473 mov $len,%rax # return value
483 .cfi_adjust_cfa_offset -232
499 .size SHA3_absorb,.-SHA3_absorb
502 { my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
503 ($out,$len,$bsz) = ("%r12","%r13","%r14");
507 .type SHA3_squeeze,\@function,4
534 sub \$8,$len # len -= 8
549 .byte 0xf3,0xa4 # rep movsb
560 .size SHA3_squeeze,.-SHA3_squeeze
565 .quad 0,0,0,0,0,0,0,0
568 .quad 0x0000000000000001
569 .quad 0x0000000000008082
570 .quad 0x800000000000808a
571 .quad 0x8000000080008000
572 .quad 0x000000000000808b
573 .quad 0x0000000080000001
574 .quad 0x8000000080008081
575 .quad 0x8000000000008009
576 .quad 0x000000000000008a
577 .quad 0x0000000000000088
578 .quad 0x0000000080008009
579 .quad 0x000000008000000a
580 .quad 0x000000008000808b
581 .quad 0x800000000000008b
582 .quad 0x8000000000008089
583 .quad 0x8000000000008003
584 .quad 0x8000000000008002
585 .quad 0x8000000000000080
586 .quad 0x000000000000800a
587 .quad 0x800000008000000a
588 .quad 0x8000000080008081
589 .quad 0x8000000000008080
590 .quad 0x0000000080000001
591 .quad 0x8000000080008008
593 .asciz "Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
596 foreach (split("\n",$code)) {
597 # Below replacement results in 11.2 on Sandy Bridge, 9.4 on
598 # Haswell, but it hurts other processors by up to 2-3-4x...
599 #s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
600 # Below replacement results in 9.3 on Haswell [as well as
601 # on Ryzen, i.e. it *hurts* Ryzen]...
602 #s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;