# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as
# spelled out in http://download.intel.com/design/intarch/papers/323686.pdf,
# is that since AESNI-CBC encrypt exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm makes better use
# of processor resources and achieves higher performance. SHA256
# instruction sequences(*) are taken from sha512-x86_64.pl and the
# AESNI code is woven into them. As SHA256 dominates execution time,
# stitch performance does not depend on AES key length. Below are
# performance numbers in cycles per processed byte, less is better,
# for standalone AESNI-CBC encrypt, standalone SHA256, and the
# stitched subroutine:
#
#		 AES-128/-192/-256+SHA256	this(**)	gain
# Sandy Bridge	 5.05/6.05/7.05+11.6		13.0		+28%/36%/43%
# Ivy Bridge	 5.05/6.05/7.05+10.3		11.6		+32%/41%/50%
# Haswell	 4.43/5.29/6.19+7.80		8.79		+39%/49%/59%
# Skylake	 2.62/3.14/3.62+7.70		8.10		+27%/34%/40%
# Bulldozer	 5.77/6.89/8.00+13.7		13.7		+42%/50%/58%
#
# (*)	there are XOP, AVX1 and AVX2 code paths; Westmere is not
#	covered because the estimated gain was not high enough to
#	justify the effort;
# (**)	these are EVP-free results; results obtained with 'speed
#	-evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=12);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}
$shaext=$avx;	### set to zero if compiling for 1.0.1
$avx=1		if (!$shaext && $avx);
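# Note on dispatch: $avx==0 selects no AVX path, $avx==1 enables the
# XOP/AVX1 subroutines, $avx>1 additionally enables the AVX2 one, and
# $shaext gates the SHA extension (SHA-NI) code path; the actual choice
# is made at run time from OPENSSL_ia32cap_P below.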
open OUT,"| \"$^X\" $xlate $flavour $output";

$func="aesni_cbc_sha256_enc";

@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
				"%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#			void *out,
#			size_t length,
#			const AES_KEY *key,
#			unsigned char *iv,
#			SHA256_CTX *ctx,
#			const void *in0);
($inp, $out, $len, $key, $ivp, $ctx, $in0) =
    ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="16*$SZ+7*8(%rsp)";
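# Stack frame layout, as implied by the slots above: the bottom 16*$SZ
# bytes hold the X[]+K[] message schedule for the block being hashed,
# followed by eight 8-byte spill slots for the function arguments and
# the caller's %rsp (with the Win64 xmm6-xmm15 save area above that).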
.extern	OPENSSL_ia32cap_P
.type	$func,\@abi-omnipotent
	lea	OPENSSL_ia32cap_P(%rip),%r11
	cmp	\$0,`$win64?"%rcx":"%rdi"`
$code.=<<___ if ($shaext);
	bt	\$61,%r10			# check for SHA
	test	\$`1<<11`,%r10d			# check for XOP
$code.=<<___ if ($avx>1);
	and	\$`1<<8|1<<5|1<<3`,%r11d	# check for BMI2+AVX2+BMI1
	cmp	\$`1<<8|1<<5|1<<3`,%r11d
	and	\$`1<<28`,%r10d			# check for AVX
	cmp	\$0,`$win64?"%rcx":"%rdi"`
.type	$TABLE,\@object
.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
.long	0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
.long	0,0,0,0,   0,0,0,0
.asciz	"AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"

######################################################################
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
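# Note: $mask10/$mask12/$mask14 implement the key-length-agnostic AES
# trick used throughout: the round sequence below is always unrolled
# for the 14-round (AES-256) schedule, vaesenclast is executed
# speculatively after rounds 10, 12 and 14, and the three masks
# (exactly one of which is all-ones, selected by the round count when
# they are loaded) pick the one valid result, so a single code path
# serves AES-128/-192/-256.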
##	&vmovdqu	($roundkey,"0x00-0x80($inp)");
##	&vmovdqu	($inout,($inp));
##	&mov		($_inp,$inp);

	'&vpxor		($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x10-0x80($inp)");',
	'&vpxor		($inout,$inout,$iv);',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x20-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x30-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x40-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x50-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x60-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x70-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x80-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x90-0x80($inp)");',
	'&vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xa0-0x80($inp)");',
	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xb0-0x80($inp)");',
	'&vpand		($iv,$temp,$mask10);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xc0-0x80($inp)");',
	'&vaesenclast	($temp,$inout,$roundkey);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xd0-0x80($inp)");',
	'&vpand		($temp,$temp,$mask12);'.
	' &vaesenc	($inout,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0xe0-0x80($inp)");',
	'&vpor		($iv,$iv,$temp);'.
	' &vaesenclast	($temp,$inout,$roundkey);'.
	' &vmovdqu	($roundkey,"0x00-0x80($inp)");'

##	&mov		($inp,$_inp);
##	&mov		($out,$_out);
##	&vpand		($temp,$temp,$mask14);
##	&vpor		($iv,$iv,$temp);
##	&vmovdqu	("($out,$inp)",$iv);	# write output
##	&lea		($inp,"16($inp)");
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&ror	($a0,$Sigma1[2]-$Sigma1[1])',
	'&ror	($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor	($a4,$g)',			# f^g
	'&ror	($a0,$Sigma1[1]-$Sigma1[0])',
	'&and	($a4,$e)',			# (f^g)&e
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&add	($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&ror	($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor	($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&ror	($a0,$Sigma1[0])',		# Sigma1(e)
	'&add	($h,$a4)',			# h+=Ch(e,f,g)
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	'&add	($h,$a0)',			# h+=Sigma1(e)
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&add	($d,$h)',			# d+=h
	'&ror	($a1,$Sigma0[0])',		# Sigma0(a)
	'&add	($h,$a3)',			# h+=Maj(a,b,c)
	'&add	($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
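# For reference, the round body above computes the standard SHA-256
# step:
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	d += T1;  h = T1 + Sigma0(a) + Maj(a,b,c)
# with Maj(a,b,c) evaluated as ((b^c)&(a^b))^b and the Sigma0(a)
# addition carried over into the following round.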
######################################################################
.type	${func}_xop,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp			# align stack frame
	sub	$inp,$out			# re-bias
	add	$inp,$len			# end of input
	#mov	$inp,$_inp			# saved later
	#mov	$key,$_key			# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r12			# borrow $a4
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV
	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl		# size optimization
	vmovdqu	(%r12),$inout			# $a4
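# The .Lxop_00_47 loop below interleaves four SHA-256 rounds with the
# message-schedule update for four future words at a time:
#	X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
# (the vpalignr/vprotd/vpsrld/vpxor sequence), plus one AES-CBC round
# injected per body_00_15 invocation.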
sub XOP_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	&vpalignr	($t0,@X[1],@X[0],$SZ);	# X[1..4]
	&vpalignr	($t3,@X[3],@X[2],$SZ);	# X[9..12]
	&vprotd		($t1,$t0,8*$SZ-$sigma0[1]);
	&vpsrld		($t0,$t0,$sigma0[2]);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..3] += X[9..12]
	&vprotd		($t2,$t1,$sigma0[1]-$sigma0[0]);
	&vpxor		($t0,$t0,$t1);
	&vprotd		($t3,@X[3],8*$SZ-$sigma1[1]);
	&vpxor		($t0,$t0,$t2);		# sigma0(X[1..4])
	&vpsrld		($t2,@X[3],$sigma1[2]);
	&vpaddd		(@X[0],@X[0],$t0);	# X[0..3] += sigma0(X[1..4])
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[14..15])
	&vpsrldq	($t3,$t3,8);
	&vpaddd		(@X[0],@X[0],$t3);	# X[0..1] += sigma1(X[14..15])
	&vprotd		($t3,@X[0],8*$SZ-$sigma1[1]);
	&vpsrld		($t2,@X[0],$sigma1[2]);
	&vprotd		($t1,$t3,$sigma1[1]-$sigma1[0]);
	&vpxor		($t3,$t3,$t2);
	&vpxor		($t3,$t3,$t1);		# sigma1(X[16..17])
	&vpslldq	($t3,$t3,8);		# 22 instructions
	&vpaddd		(@X[0],@X[0],$t3);	# X[2..3] += sigma1(X[16..17])
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&XOP_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov	("%r12",$_inp);			# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);			# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu ("(%r15,%r12)",$iv);		# write output
	&lea	("%r12","16(%r12)");		# inp++
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lxop_00_47");
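# Note: the loop terminator is data-driven rather than counted. The
# cmpb above probes one table row ahead: the examined byte is non-zero
# in every K256 row but zero in the byte-swap mask row appended after
# the constants, so the loop falls through after the last 16 rounds.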
	&vmovdqu	($inout,"(%r12)");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
	mov	$_inp,%r12			# borrow $a4
	mov	$_out,%r13			# borrow $a0
	mov	$_ctx,%r15			# borrow $a2
	mov	$_in0,%rsi			# borrow $a3
	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)			# write output
	lea	16(%r12),%r12			# inp++
	vmovdqu	$iv,($ivp)			# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.size	${func}_xop,.-${func}_xop

######################################################################
local *ror = sub { &shrd(@_[0],@_) };

.type	${func}_avx,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`$framesz+$win64*16*10`,%rsp
	and	\$-64,%rsp			# align stack frame
	sub	$inp,$out			# re-bias
	add	$inp,$len			# end of input
	#mov	$inp,$_inp			# saved later
	#mov	$key,$_key			# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r12			# borrow $a4
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r13	# borrow $a0
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV
	vmovdqa	0x00(%r13,%r14,8),$mask14
	vmovdqa	0x10(%r13,%r14,8),$mask12
	vmovdqa	0x20(%r13,%r14,8),$mask10
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	0x00(%rsi,%r12),@X[0]
	vmovdqu	0x10(%rsi,%r12),@X[1]
	vmovdqu	0x20(%rsi,%r12),@X[2]
	vmovdqu	0x30(%rsi,%r12),@X[3]
	vpshufb	$t3,@X[0],@X[0]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[1],@X[1]
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	sub	\$-16*2*$SZ,$Tbl		# size optimization
	vmovdqu	(%r12),$inout			# $a4
sub Xupdate_256_AVX () {
	(
	'&vpalignr	($t0,@X[1],@X[0],$SZ)',	# X[1..4]
	'&vpalignr	($t3,@X[3],@X[2],$SZ)',	# X[9..12]
	'&vpsrld	($t2,$t0,$sigma0[0]);',
	'&vpaddd	(@X[0],@X[0],$t3)',	# X[0..3] += X[9..12]
	'&vpsrld	($t3,$t0,$sigma0[2])',
	'&vpslld	($t1,$t0,8*$SZ-$sigma0[1]);',
	'&vpxor		($t0,$t3,$t2)',
	'&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
	'&vpsrld	($t2,$t2,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t1)',
	'&vpslld	($t1,$t1,$sigma0[1]-$sigma0[0]);',
	'&vpxor		($t0,$t0,$t2)',
	'&vpsrld	($t2,$t3,$sigma1[2]);',
	'&vpxor		($t0,$t0,$t1)',		# sigma0(X[1..4])
	'&vpsrlq	($t3,$t3,$sigma1[0]);',
	'&vpaddd	(@X[0],@X[0],$t0)',	# X[0..3] += sigma0(X[1..4])
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',		# sigma1(X[14..15])
	'&vpshufd	($t2,$t2,0b10000100)',
	'&vpsrldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)',	# X[0..1] += sigma1(X[14..15])
	'&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
	'&vpsrld	($t2,$t3,$sigma1[2])',
	'&vpsrlq	($t3,$t3,$sigma1[0])',
	'&vpxor		($t2,$t2,$t3);',
	'&vpsrlq	($t3,$t3,$sigma1[1]-$sigma1[0])',
	'&vpxor		($t2,$t2,$t3)',
	'&vpshufd	($t2,$t2,0b11101000)',
	'&vpslldq	($t2,$t2,8)',
	'&vpaddd	(@X[0],@X[0],$t2)'	# X[2..3] += sigma1(X[16..17])
	);
}
sub AVX_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 104 instructions

	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	(16*$j."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX_256_00_47($j,\&body_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&mov	("%r12",$_inp);			# borrow $a4
	&vpand	($temp,$temp,$mask14);
	&mov	("%r15",$_out);			# borrow $a2
	&vpor	($iv,$iv,$temp);
	&vmovdqu ("(%r15,%r12)",$iv);		# write output
	&lea	("%r12","16(%r12)");		# inp++
	&cmpb	($SZ-1+16*2*$SZ."($Tbl)",0);
	&jne	(".Lavx_00_47");
	&vmovdqu	($inout,"(%r12)");

    for ($i=0; $i<16; ) {
	foreach(body_00_15()) { eval; }
    }
	mov	$_inp,%r12			# borrow $a4
	mov	$_out,%r13			# borrow $a0
	mov	$_ctx,%r15			# borrow $a2
	mov	$_in0,%rsi			# borrow $a3
	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r13,%r12)			# write output
	lea	16(%r12),%r12			# inp++
	vmovdqu	$iv,($ivp)			# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.size	${func}_avx,.-${func}_avx

######################################################################
my $a5=$SZ==4?"%esi":"%rsi";			# zap $inp
# On entry $a1 must be zero, $a3 must hold $b^$c, and $a4 a copy of $f.
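# The AVX2 round body relies on BMI1/BMI2: rorx rotates without
# touching flags or its source, andn computes ~e&g directly, and lea
# performs the additions, which is why this path is guarded by the
# three-way BMI2+AVX2+BMI1 capability check above.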
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
	'&add	($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',	# h+=X[i]+K[i]
	'&and	($a4,$e)',			# f&e
	'&rorx	($a0,$e,$Sigma1[2])',
	'&rorx	($a2,$e,$Sigma1[1])',
	'&lea	($a,"($a,$a1)")',		# h+=Sigma0(a) from the past
	'&lea	($h,"($h,$a4)")',
	'&andn	($a4,$e,$g)',			# ~e&g
	'&rorx	($a1,$e,$Sigma1[0])',
	'&lea	($h,"($h,$a4)")',		# h+=Ch(e,f,g)=(e&f)+(~e&g)
	'&xor	($a0,$a1)',			# Sigma1(e)
	'&rorx	($a4,$a,$Sigma0[2])',
	'&lea	($h,"($h,$a0)")',		# h+=Sigma1(e)
	'&xor	($a2,$b)',			# a^b, b^c in next round
	'&rorx	($a1,$a,$Sigma0[1])',
	'&rorx	($a0,$a,$Sigma0[0])',
	'&lea	($d,"($d,$h)")',		# d+=h
	'&and	($a3,$a2)',			# (b^c)&(a^b)
	@aesni_cbc_block[$aesni_cbc_idx++].
	'&xor	($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)
	'&xor	($a1,$a0)',			# Sigma0(a)
	'&lea	($h,"($h,$a3)");'.		# h+=Maj(a,b,c)
	'&mov	($a4,$e)',			# copy of f in future
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
# and at the finish one has to $a+=$a1

.type	${func}_avx2,\@function,6
	mov	`($win64?56:8)`(%rsp),$in0	# load 7th parameter
	mov	%rsp,%r11			# copy %rsp
	sub	\$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
	and	\$-256*$SZ,%rsp			# align stack frame
	add	\$`2*$SZ*($rounds-8)`,%rsp
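# Note on the frame: the AVX2 path walks its X[]+K[] schedule with
# rolling lea -$PUSH8(%rsp) adjustments, so %rsp is first aligned down
# to 256*$SZ and then biased up by 2*$SZ*($rounds-8) to leave room for
# that backward-growing window.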
	sub	$inp,$out			# re-bias
	add	$inp,$len			# end of input
	#mov	$inp,$_inp			# saved later
	#mov	$out,$_out			# kept in $offload
	#mov	$key,$_key			# remains resident in $inp register
$code.=<<___ if ($win64);
	movaps	%xmm6,`$framesz+16*0`(%rsp)
	movaps	%xmm7,`$framesz+16*1`(%rsp)
	movaps	%xmm8,`$framesz+16*2`(%rsp)
	movaps	%xmm9,`$framesz+16*3`(%rsp)
	movaps	%xmm10,`$framesz+16*4`(%rsp)
	movaps	%xmm11,`$framesz+16*5`(%rsp)
	movaps	%xmm12,`$framesz+16*6`(%rsp)
	movaps	%xmm13,`$framesz+16*7`(%rsp)
	movaps	%xmm14,`$framesz+16*8`(%rsp)
	movaps	%xmm15,`$framesz+16*9`(%rsp)
	mov	$inp,%r13			# borrow $a0
	vpinsrq	\$1,$out,$offload,$offload
	lea	0x80($key),$inp			# size optimization, reassign
	lea	$TABLE+`$SZ*2*$rounds+32`(%rip),%r12	# borrow $a4
	mov	0xf0-0x80($inp),%r14d		# rounds, borrow $a1
	mov	$ctx,%r15			# borrow $a2
	mov	$in0,%rsi			# borrow $a3
	vmovdqu	($ivp),$iv			# load IV
	vmovdqa	0x00(%r12,%r14,8),$mask14
	vmovdqa	0x10(%r12,%r14,8),$mask12
	vmovdqa	0x20(%r12,%r14,8),$mask10
	sub	\$-16*$SZ,%r13			# inp++, size optimization
	lea	(%rsi,%r13),%r12		# borrow $a0
	cmp	$len,%r13			# $_end
	cmove	%rsp,%r12			# next block or random data
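# The ymm registers carry the schedule for two 64-byte blocks side by
# side; vinserti128 below fills the high lane from %r12. When no next
# block exists, %r12 was pointed at the stack (cmove above) so the
# speculative loads cannot fault, and the rounds computed over that
# junk lane are discarded once the $_end comparison fires.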
	vmovdqu	0x00-0x80($inp),$roundkey
if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0(%rsi,%r13),%xmm0
	vmovdqu	-16*$SZ+16(%rsi,%r13),%xmm1
	vmovdqu	-16*$SZ+32(%rsi,%r13),%xmm2
	vmovdqu	-16*$SZ+48(%rsi,%r13),%xmm3
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	lea	-16*$SZ(%r13),%r13
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vmovdqa	$t2,0x00(%rsp)
	vmovdqa	$t3,0x20(%rsp)
	sub	\$-16*2*$SZ,$Tbl		# size optimization
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload

sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&vmovq	("%r13",$offload);		# borrow $a0
	&vpextrq ("%r15",$offload,1);		# borrow $a2
	&vpand	($temp,$temp,$mask14);
	&vpor	($iv,$iv,$temp);
	&vmovdqu ("(%r15,%r13)",$iv);		# write output
	&lea	("%r13","16(%r13)");		# inp++
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");
	&vmovdqu	($inout,"(%r13)");
	&vpinsrq	($offload,$offload,"%r13",0);

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
	vpextrq	\$1,$offload,%r12		# $_out, borrow $a4
	vmovq	$offload,%r13			# $_inp, borrow $a0
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
	vpand	$mask14,$temp,$temp
	vmovdqu	$iv,(%r12,%r13)			# write output
	cmp	`$PUSH8+2*8`($Tbl),%r13		# $_end
	vmovdqu	(%r13),$inout
	vpinsrq	\$0,%r13,$offload,$offload

    for ($i=0; $i<16; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
	&lea	($Tbl,"-$PUSH8($Tbl)")	if ($i==8);
    }
	vmovq	$offload,%r13			# borrow $a0
	vpextrq	\$1,$offload,%r15		# borrow $a2
	vpand	$mask14,$temp,$temp
	lea	-$PUSH8($Tbl),$Tbl
	vmovdqu	$iv,(%r15,%r13)			# write output
	lea	16(%r13),%r13			# inp++
	mov	`2*$SZ*$rounds+5*8`(%rsp),%r15	# $_ctx, borrow $a2
	lea	16*$SZ(%r13),%r13
	mov	`2*$SZ*$rounds+6*8`(%rsp),%rsi	# $_in0, borrow $a3
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
	lea	(%rsi,%r13),%r12
	cmove	%rsp,%r12			# next block or stale data
	vmovdqu	$iv,($ivp)			# output IV
$code.=<<___ if ($win64);
	movaps	`$framesz+16*0`(%rsp),%xmm6
	movaps	`$framesz+16*1`(%rsp),%xmm7
	movaps	`$framesz+16*2`(%rsp),%xmm8
	movaps	`$framesz+16*3`(%rsp),%xmm9
	movaps	`$framesz+16*4`(%rsp),%xmm10
	movaps	`$framesz+16*5`(%rsp),%xmm11
	movaps	`$framesz+16*6`(%rsp),%xmm12
	movaps	`$framesz+16*7`(%rsp),%xmm13
	movaps	`$framesz+16*8`(%rsp),%xmm14
	movaps	`$framesz+16*9`(%rsp),%xmm15
.size	${func}_avx2,.-${func}_avx2

my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my ($rounds,$Tbl)=("%r11d","%rbx");

my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));

my ($n,$k)=($r/10,$r%10);
	movups	`16*$n`($in0),$in		# load input
$code.=<<___ if ($n);
	movups	$iv,`16*($n-1)`($out,$in0)	# write output
	movups	`32+16*$k-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+0)-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+1)-112`($key),$rndkey[0]
	aesenc	$rndkey[1],$iv
	movups	`32+16*($k+2)-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
	movups	`32+16*($k+3)-112`($key),$rndkey[0]
	aesenc	$rndkey[1],$iv
	aesenclast	$rndkey[0],$iv
	movups	16-112($key),$rndkey[1]		# forward reference
	movups	`32+16*$k-112`($key),$rndkey[1]
	aesenc	$rndkey[0],$iv
    $r++;	unshift(@rndkey,pop(@rndkey));
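# The $aesenc closure above emits exactly one AES round (or the
# key-length dependent aesenclast tail) per call: $n=$r/10 indexes the
# 16-byte CBC block, $k=$r%10 the round within it. The SHA code below
# invokes it at a fixed cadence, so forty calls cover the four CBC
# blocks per 64 bytes hashed (see the "while ($r<40)" drain further
# down).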
.type	${func}_shaext,\@function,6
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps	%xmm10,-8-6*16(%rax)
	movaps	%xmm11,-8-5*16(%rax)
	movaps	%xmm12,-8-4*16(%rax)
	movaps	%xmm13,-8-3*16(%rax)
	movaps	%xmm14,-8-2*16(%rax)
	movaps	%xmm15,-8-1*16(%rax)
	lea	K256+0x80(%rip),$Tbl
	movdqu	($ctx),$ABEF		# DCBA
	movdqu	16($ctx),$CDGH		# HGFE
	movdqa	0x200-0x80($Tbl),$TMP	# byte swap mask
	mov	240($key),$rounds
	movups	($key),$rndkey0		# $key[0]
	movups	16($key),$rndkey[0]	# forward reference
	lea	112($key),$key		# size optimization
	pshufd	\$0x1b,$ABEF,$Wi	# ABCD
	pshufd	\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd	\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa	$TMP,$BSWAP		# offload
	palignr	\$8,$CDGH,$ABEF		# ABEF
	punpcklqdq	$Wi,$CDGH	# CDGH
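# sha256rnds2 expects the state split across two registers in
# ABEF/CDGH order rather than the canonical A..H layout of SHA256_CTX,
# hence the pshufd/palignr/punpcklqdq shuffle above (and its inverse
# before the state is stored back at the end).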
	movdqu	($inp),@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqu	0x20($inp),@MSG[2]
	movdqu	0x30($inp),@MSG[3]

	movdqa	0*32-0x80($Tbl),$Wi
	movdqa	$CDGH,$CDGH_SAVE	# offload
	movdqa	$ABEF,$ABEF_SAVE	# offload
	sha256rnds2	$ABEF,$CDGH	# 0-3
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa	1*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 4-7
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa	2*32-0x80($Tbl),$Wi
	sha256msg1	@MSG[1],@MSG[0]
	sha256rnds2	$ABEF,$CDGH	# 8-11
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[2],$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa	3*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 12-15
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[3],$TMP
	sha256rnds2	$CDGH,$ABEF

for($i=4;$i<16-3;$i++) {
&$aesenc()			if (($r%10)==0);
	movdqa	$i*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 16-19...
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[0],$TMP
&$aesenc()			if ($r==19);
	sha256rnds2	$CDGH,$ABEF
push(@MSG,shift(@MSG));
}
	movdqa	13*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 52-55
	pshufd	\$0x0e,$Wi,$Wi
	palignr	\$4,@MSG[0],$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa	14*32-0x80($Tbl),$Wi
	sha256msg2	@MSG[1],@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 56-59
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF

	movdqa	15*32-0x80($Tbl),$Wi
	sha256rnds2	$ABEF,$CDGH	# 60-63
	pshufd	\$0x0e,$Wi,$Wi
	sha256rnds2	$CDGH,$ABEF
	#pxor	$CDGH,$rndkey0		# black magic
while ($r<40)	{ &$aesenc(); }		# remaining aesenc's
	#xorps	$CDGH,$rndkey0		# black magic
	paddd	$CDGH_SAVE,$CDGH
	paddd	$ABEF_SAVE,$ABEF
	movups	$iv,48($out,$in0)	# write output
	pshufd	\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd	\$0x1b,$ABEF,$TMP	# FEBA
	pshufd	\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF	# DCBA
	palignr	\$8,$TMP,$CDGH		# HGFE
	movups	$iv,($ivp)		# write IV
	movdqu	$CDGH,16($ctx)
$code.=<<___ if ($win64);
	movaps	0*16(%rsp),%xmm6
	movaps	1*16(%rsp),%xmm7
	movaps	2*16(%rsp),%xmm8
	movaps	3*16(%rsp),%xmm9
	movaps	4*16(%rsp),%xmm10
	movaps	5*16(%rsp),%xmm11
	movaps	6*16(%rsp),%xmm12
	movaps	7*16(%rsp),%xmm13
	movaps	8*16(%rsp),%xmm14
	movaps	9*16(%rsp),%xmm15
	lea	8+10*16(%rsp),%rsp
.size	${func}_shaext,.-${func}_shaext
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64 && $avx) {
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip
	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	mov	152($context),%rax	# pull context->Rsp
	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	add	\$`2*$SZ*($rounds-8)`,%rax
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	.long	0xa548f3fc		# cld; rep movsq
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi
	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)
	mov	\$1,%eax		# ExceptionContinueSearch
.size	se_handler,.-se_handler
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
.LSEH_info_${func}_xop:
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
.LSEH_info_${func}_avx:
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}
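# sha256op38() exists because older assemblers predate the SHA-NI
# mnemonics: instead of emitting "sha256rnds2 %xmm0,%xmm2" directly it
# encodes the instruction by hand as raw 0x0f,0x38,<opcode>,<ModR/M>
# bytes (plus a REX prefix from rex() when either operand is
# %xmm8..15), and falls back to the plain mnemonic for anything it
# does not recognize.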
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;