crypto/modes/asm/aesni-gcm-x86_64.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the OpenSSL license (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 #
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16 #
  17 #
  18 # AES-NI-CTR+GHASH stitch.
  19 #
  20 # February 2013
  21 #
  22 # OpenSSL GCM implementation is organized in such a way that its
  23 # performance is rather close to the sum of its streamed components,
  24 # in the context parallelized AES-NI CTR and modulo-scheduled
  25 # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
  26 # was observed to perform significantly better than the sum of the
  27 # components on contemporary CPUs, the effort was deemed impossible to
  28 # justify. This module is based on combination of Intel submissions,
  29 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
  30 # Locktyukhin of Intel Corp. who verified that it reduces shuffles
  31 # pressure with notable relative improvement, achieving 1.0 cycle per
  32 # byte processed with 128-bit key on Haswell processor, 0.74 - on
  33 # Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
  34 # measurements for favourable packet size, one divisible by 96.
  35 # Applications using the EVP interface will observe a few percent
  36 # worse performance.]
  37 #
  38 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
  39 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
  40
  41 $flavour = shift;       # assembler flavour, handed on to x86_64-xlate.pl below
  42 $output  = shift;       # name of the file to generate
     # A first argument containing a dot is taken to be the output file name
     # (no flavour was given).
  43 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  44
  45 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  46
     # Locate the translator next to this script or in ../../perlasm.
  47 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  48 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  49 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  50 die "can't locate x86_64-xlate.pl";
  51
     # Toolchain probes. Each one sets $avx to the sum of two version
     # comparisons, i.e. 0, 1 or 2; the code below is only emitted when
     # $avx>1. Every probe after the first runs only while $avx is still false.
  52 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  53                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  54         $avx = ($1>=2.20) + ($1>=2.22);
  55 }
  56
  57 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  58             `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  59         $avx = ($1>=2.09) + ($1>=2.10);
  60 }
  61
  62 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  63             `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  64         $avx = ($1>=10) + ($1>=11);
  65 }
  66
     # The major version may have more than one digit (clang 10 and later) and
     # the banner may carry a vendor prefix ("Apple clang version ..."), so the
     # pattern is neither anchored nor limited to a single leading digit.
  67 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
  68         $avx = ($2>=3.0) + ($2>3.0);
  69 }
  70
     # Everything printed to STDOUT from here on is piped through the
     # translator; fail loudly if the pipe cannot be set up.
  71 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
  72 *STDOUT=*OUT;
  73
  74 if ($avx>1) {{{
  75
  76 ($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
     # ^ the six arguments, in the order of the C prototype given further down
     #   for aesni_gcm_[en|de]crypt.
  77
     # %xmm0-%xmm8: GHASH working set (input block, temporaries, hash key
     # power, accumulators Z0-Z3 and the running hash value Xi).
  78 ($Ii,$T1,$T2,$Hkey,
  79  $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));
  80
     # %xmm9-%xmm14: the six AES-CTR blocks in flight; %xmm15: current round key.
  81 ($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));
  82
     # Integer scratch shared by all routines below. %ebx, %ebp, %r14 and %r15
     # are among the registers the public entry points push on entry.
  83 ($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
  84
  85 $code=<<___;
  86 .text
  87
             # _aesni_ctr32_ghash_6x: stitched AES-CTR + GHASH inner loop that
             # handles six 16-byte blocks (0x60 bytes) per iteration. It is an
             # internal helper reached via "call" from aesni_gcm_decrypt and
             # aesni_gcm_encrypt, so every N(%rsp) slot filled in by the caller
             # is addressed here as N+8(%rsp).
             #
             # State on entry, as set up by the callers in this file:
             #   $inp, $out  source and destination of the CTR stream
             #   $len        remaining length in 16-byte blocks
             #   $key        key schedule pointer, biased by +0x80
             #   $ivp        counter block in memory, rewritten on every pass
             #   $Xip        hash context pointer, biased by +0x40
             #   $counter    bytes 12..15 of the counter block
             #   $rounds     value read from offset 0xf0 of the key structure
             #   $const      address of .Lbswap_mask
             #   $in0        cursor over the data that is being hashed
             #   $end0       end of that data minus 0xc0
             #   $T1         current counter block
             #   $Xi         byte-swapped hash value
             #   $Z3         byte-swapped block I[5], while I[4]..I[0] sit in
             #               the caller's 0x30..0x70(%rsp)
             #   $ret        running byte count, advanced by 0x60 per pass
             # On return $inout0..$inout5 hold the last six output blocks, not
             # yet stored. The caller writes them to -0x60($out)..-0x10($out).
  88 .type   _aesni_ctr32_ghash_6x,\@abi-omnipotent
  89 .align  32
  90 _aesni_ctr32_ghash_6x:
  91         vmovdqu         0x20($const),$T2        # borrow $T2, .Lone_msb
  92         sub             \$6,$len                # charge the six blocks of the first pass
  93         vpxor           $Z0,$Z0,$Z0             # $Z0   = 0
  94         vmovdqu         0x00-0x80($key),$rndkey
             # Counter blocks 1..5 are derived with byte-wise adds, which cannot
             # carry from one byte into the next. The carry check at .Loop6x
             # covers the case where that matters.
  95         vpaddb          $T2,$T1,$inout1
  96         vpaddb          $T2,$inout1,$inout2
  97         vpaddb          $T2,$inout2,$inout3
  98         vpaddb          $T2,$inout3,$inout4
  99         vpaddb          $T2,$inout4,$inout5
 100         vpxor           $rndkey,$T1,$inout0
 101         vmovdqu         $Z0,16+8(%rsp)          # "$Z3" = 0
 102         jmp             .Loop6x
 103
 104 .align  32
 105 .Loop6x:
             # $counter holds bytes 12..15 of the counter block as a little-endian
             # load, so byte 15 is its top byte. A carry out of this add sends us
             # to .Lhandle_ctr32, which rebuilds the six counter blocks with
             # 32-bit adds on the byte-swapped value instead of using vpaddb.
 106         add             \$`6<<24`,$counter
 107         jc              .Lhandle_ctr32          # discard $inout[1-5]?
 108         vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
 109           vpaddb        $T2,$inout5,$T1         # next counter value
 110           vpxor         $rndkey,$inout1,$inout1
 111           vpxor         $rndkey,$inout2,$inout2
 112
 113 .Lresume_ctr32:
 114         vmovdqu         $T1,($ivp)              # save next counter value
 115         vpclmulqdq      \$0x10,$Hkey,$Z3,$Z1
 116           vpxor         $rndkey,$inout3,$inout3
 117           vmovups       0x10-0x80($key),$T2     # borrow $T2 for $rndkey
 118         vpclmulqdq      \$0x01,$Hkey,$Z3,$Z2
 119
 120         # At this point, the current block of 96 (0x60) bytes has already been
 121         # loaded into registers. Concurrently with processing it, we want to
 122         # load the next 96 bytes of input for the next round. Obviously, we can
 123         # only do this if there are at least 96 more bytes of input beyond the
 124         # input we're currently processing, or else we'd read past the end of
 125         # the input buffer. Here, we set |%r12| to 96 if there are at least 96
 126         # bytes of input beyond the 96 bytes we're already processing, and we
 127         # set |%r12| to 0 otherwise. In the case where we set |%r12| to 96,
 128         # we'll read in the next block so that it is in registers for the next
 129         # loop iteration. In the case where we set |%r12| to 0, we'll re-read
 130         # the current block and then ignore what we re-read.
 131         #
 132         # At this point, |$in0| points to the current (already read into
 133         # registers) block, and |$end0| points to 2*96 bytes before the end of
 134         # the input. Thus, |$in0| > |$end0| means that we do not have the next
 135         # 96-byte block to read in, and |$in0| <= |$end0| means we do.
 136         xor             %r12,%r12
 137         cmp             $in0,$end0
 138
 139           vaesenc       $T2,$inout0,$inout0
 140         vmovdqu         0x30+8(%rsp),$Ii        # I[4]
 141           vpxor         $rndkey,$inout4,$inout4
 142         vpclmulqdq      \$0x00,$Hkey,$Z3,$T1
 143           vaesenc       $T2,$inout1,$inout1
 144           vpxor         $rndkey,$inout5,$inout5
 145         setnc           %r12b                   # 1 if $end0 >= $in0, flags still from the cmp above
 146         vpclmulqdq      \$0x11,$Hkey,$Z3,$Z3
 147           vaesenc       $T2,$inout2,$inout2
 148         vmovdqu         0x10-0x20($Xip),$Hkey   # $Hkey^2
 149         neg             %r12                    # 0 or all ones
 150           vaesenc       $T2,$inout3,$inout3
 151          vpxor          $Z1,$Z2,$Z2
 152         vpclmulqdq      \$0x00,$Hkey,$Ii,$Z1
 153          vpxor          $Z0,$Xi,$Xi             # modulo-scheduled
 154           vaesenc       $T2,$inout4,$inout4
 155          vpxor          $Z1,$T1,$Z0
 156         and             \$0x60,%r12             # 0 or 0x60
 157           vmovups       0x20-0x80($key),$rndkey
 158         vpclmulqdq      \$0x10,$Hkey,$Ii,$T1
 159           vaesenc       $T2,$inout5,$inout5
 160
 161         vpclmulqdq      \$0x01,$Hkey,$Ii,$T2
 162         lea             ($in0,%r12),$in0        # advance by 0x60 bytes, or stay put
 163           vaesenc       $rndkey,$inout0,$inout0
 164          vpxor          16+8(%rsp),$Xi,$Xi      # modulo-scheduled [vpxor $Z3,$Xi,$Xi]
 165         vpclmulqdq      \$0x11,$Hkey,$Ii,$Hkey
 166          vmovdqu        0x40+8(%rsp),$Ii        # I[3]
 167           vaesenc       $rndkey,$inout1,$inout1
             # From here on each movbe pair plus the two stores that follow it
             # places a byte-reversed copy of one 16-byte block at $in0 into the
             # stack slot that the next pass reads as I[n]. This pair handles
             # offset 0x50, which is reloaded as I[5] just before looping.
 168         movbe           0x58($in0),%r13
 169           vaesenc       $rndkey,$inout2,$inout2
 170         movbe           0x50($in0),%r12
 171           vaesenc       $rndkey,$inout3,$inout3
 172         mov             %r13,0x20+8(%rsp)
 173           vaesenc       $rndkey,$inout4,$inout4
 174         mov             %r12,0x28+8(%rsp)
 175         vmovdqu         0x30-0x20($Xip),$Z1     # borrow $Z1 for $Hkey^3
 176           vaesenc       $rndkey,$inout5,$inout5
 177
 178           vmovups       0x30-0x80($key),$rndkey
 179          vpxor          $T1,$Z2,$Z2
 180         vpclmulqdq      \$0x00,$Z1,$Ii,$T1
 181           vaesenc       $rndkey,$inout0,$inout0
 182          vpxor          $T2,$Z2,$Z2
 183         vpclmulqdq      \$0x10,$Z1,$Ii,$T2
 184           vaesenc       $rndkey,$inout1,$inout1
 185          vpxor          $Hkey,$Z3,$Z3
 186         vpclmulqdq      \$0x01,$Z1,$Ii,$Hkey
 187           vaesenc       $rndkey,$inout2,$inout2
 188         vpclmulqdq      \$0x11,$Z1,$Ii,$Z1
 189          vmovdqu        0x50+8(%rsp),$Ii        # I[2]
 190           vaesenc       $rndkey,$inout3,$inout3
 191           vaesenc       $rndkey,$inout4,$inout4
 192          vpxor          $T1,$Z0,$Z0
 193         vmovdqu         0x40-0x20($Xip),$T1     # borrow $T1 for $Hkey^4
 194           vaesenc       $rndkey,$inout5,$inout5
 195
 196           vmovups       0x40-0x80($key),$rndkey
 197          vpxor          $T2,$Z2,$Z2
 198         vpclmulqdq      \$0x00,$T1,$Ii,$T2
 199           vaesenc       $rndkey,$inout0,$inout0
 200          vpxor          $Hkey,$Z2,$Z2
 201         vpclmulqdq      \$0x10,$T1,$Ii,$Hkey
 202           vaesenc       $rndkey,$inout1,$inout1
 203         movbe           0x48($in0),%r13
 204          vpxor          $Z1,$Z3,$Z3
 205         vpclmulqdq      \$0x01,$T1,$Ii,$Z1
 206           vaesenc       $rndkey,$inout2,$inout2
 207         movbe           0x40($in0),%r12
 208         vpclmulqdq      \$0x11,$T1,$Ii,$T1
 209          vmovdqu        0x60+8(%rsp),$Ii        # I[1]
 210           vaesenc       $rndkey,$inout3,$inout3
 211         mov             %r13,0x30+8(%rsp)
 212           vaesenc       $rndkey,$inout4,$inout4
 213         mov             %r12,0x38+8(%rsp)
 214          vpxor          $T2,$Z0,$Z0
 215         vmovdqu         0x60-0x20($Xip),$T2     # borrow $T2 for $Hkey^5
 216           vaesenc       $rndkey,$inout5,$inout5
 217
 218           vmovups       0x50-0x80($key),$rndkey
 219          vpxor          $Hkey,$Z2,$Z2
 220         vpclmulqdq      \$0x00,$T2,$Ii,$Hkey
 221           vaesenc       $rndkey,$inout0,$inout0
 222          vpxor          $Z1,$Z2,$Z2
 223         vpclmulqdq      \$0x10,$T2,$Ii,$Z1
 224           vaesenc       $rndkey,$inout1,$inout1
 225         movbe           0x38($in0),%r13
 226          vpxor          $T1,$Z3,$Z3
 227         vpclmulqdq      \$0x01,$T2,$Ii,$T1
 228          vpxor          0x70+8(%rsp),$Xi,$Xi    # accumulate I[0]
 229           vaesenc       $rndkey,$inout2,$inout2
 230         movbe           0x30($in0),%r12
 231         vpclmulqdq      \$0x11,$T2,$Ii,$T2
 232           vaesenc       $rndkey,$inout3,$inout3
 233         mov             %r13,0x40+8(%rsp)
 234           vaesenc       $rndkey,$inout4,$inout4
 235         mov             %r12,0x48+8(%rsp)
 236          vpxor          $Hkey,$Z0,$Z0
 237          vmovdqu        0x70-0x20($Xip),$Hkey   # $Hkey^6
 238           vaesenc       $rndkey,$inout5,$inout5
 239
 240           vmovups       0x60-0x80($key),$rndkey
 241          vpxor          $Z1,$Z2,$Z2
 242         vpclmulqdq      \$0x10,$Hkey,$Xi,$Z1
 243           vaesenc       $rndkey,$inout0,$inout0
 244          vpxor          $T1,$Z2,$Z2
 245         vpclmulqdq      \$0x01,$Hkey,$Xi,$T1
 246           vaesenc       $rndkey,$inout1,$inout1
 247         movbe           0x28($in0),%r13
 248          vpxor          $T2,$Z3,$Z3
 249         vpclmulqdq      \$0x00,$Hkey,$Xi,$T2
 250           vaesenc       $rndkey,$inout2,$inout2
 251         movbe           0x20($in0),%r12
 252         vpclmulqdq      \$0x11,$Hkey,$Xi,$Xi
 253           vaesenc       $rndkey,$inout3,$inout3
 254         mov             %r13,0x50+8(%rsp)
 255           vaesenc       $rndkey,$inout4,$inout4
 256         mov             %r12,0x58+8(%rsp)
 257         vpxor           $Z1,$Z2,$Z2
 258           vaesenc       $rndkey,$inout5,$inout5
 259         vpxor           $T1,$Z2,$Z2
 260
 261           vmovups       0x70-0x80($key),$rndkey
 262         vpslldq         \$8,$Z2,$Z1
 263         vpxor           $T2,$Z0,$Z0
 264         vmovdqu         0x10($const),$Hkey      # .Lpoly
 265
 266           vaesenc       $rndkey,$inout0,$inout0
 267         vpxor           $Xi,$Z3,$Z3
 268           vaesenc       $rndkey,$inout1,$inout1
 269         vpxor           $Z1,$Z0,$Z0
 270         movbe           0x18($in0),%r13
 271           vaesenc       $rndkey,$inout2,$inout2
 272         movbe           0x10($in0),%r12
 273         vpalignr        \$8,$Z0,$Z0,$Ii         # 1st phase
 274         vpclmulqdq      \$0x10,$Hkey,$Z0,$Z0
 275         mov             %r13,0x60+8(%rsp)
 276           vaesenc       $rndkey,$inout3,$inout3
 277         mov             %r12,0x68+8(%rsp)
 278           vaesenc       $rndkey,$inout4,$inout4
 279           vmovups       0x80-0x80($key),$T1     # borrow $T1 for $rndkey
 280           vaesenc       $rndkey,$inout5,$inout5
 281
 282           vaesenc       $T1,$inout0,$inout0
 283           vmovups       0x90-0x80($key),$rndkey
 284           vaesenc       $T1,$inout1,$inout1
 285         vpsrldq         \$8,$Z2,$Z2
 286           vaesenc       $T1,$inout2,$inout2
 287         vpxor           $Z2,$Z3,$Z3
 288           vaesenc       $T1,$inout3,$inout3
 289         vpxor           $Ii,$Z0,$Z0
 290         movbe           0x08($in0),%r13
 291           vaesenc       $T1,$inout4,$inout4
 292         movbe           0x00($in0),%r12
 293           vaesenc       $T1,$inout5,$inout5
 294           vmovups       0xa0-0x80($key),$T1
 295           cmp           \$11,$rounds
 296           jb            .Lenc_tail              # 128-bit key
 297
 298           vaesenc       $rndkey,$inout0,$inout0
 299           vaesenc       $rndkey,$inout1,$inout1
 300           vaesenc       $rndkey,$inout2,$inout2
 301           vaesenc       $rndkey,$inout3,$inout3
 302           vaesenc       $rndkey,$inout4,$inout4
 303           vaesenc       $rndkey,$inout5,$inout5
 304
 305           vaesenc       $T1,$inout0,$inout0
 306           vaesenc       $T1,$inout1,$inout1
 307           vaesenc       $T1,$inout2,$inout2
 308           vaesenc       $T1,$inout3,$inout3
 309           vaesenc       $T1,$inout4,$inout4
 310           vmovups       0xb0-0x80($key),$rndkey
 311           vaesenc       $T1,$inout5,$inout5
 312           vmovups       0xc0-0x80($key),$T1
               # The flags tested next are still those of the cmp against 11
               # above, because none of the vector instructions in between
               # writes EFLAGS.
 313           je            .Lenc_tail              # 192-bit key
 314
 315           vaesenc       $rndkey,$inout0,$inout0
 316           vaesenc       $rndkey,$inout1,$inout1
 317           vaesenc       $rndkey,$inout2,$inout2
 318           vaesenc       $rndkey,$inout3,$inout3
 319           vaesenc       $rndkey,$inout4,$inout4
 320           vaesenc       $rndkey,$inout5,$inout5
 321
 322           vaesenc       $T1,$inout0,$inout0
 323           vaesenc       $T1,$inout1,$inout1
 324           vaesenc       $T1,$inout2,$inout2
 325           vaesenc       $T1,$inout3,$inout3
 326           vaesenc       $T1,$inout4,$inout4
 327           vmovups       0xd0-0x80($key),$rndkey
 328           vaesenc       $T1,$inout5,$inout5
 329           vmovups       0xe0-0x80($key),$T1
 330           jmp           .Lenc_tail              # 256-bit key
 331
 332 .align  32
             # Slow path for a pass in which the 6<<24 add to $counter carried
             # out: byte-swap the counter block, step it with 32-bit adds, then
             # swap the six results back and rejoin the main flow.
 333 .Lhandle_ctr32:
 334         vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
 335           vpshufb       $Ii,$T1,$Z2             # byte-swap counter
 336           vmovdqu       0x30($const),$Z1        # borrow $Z1, .Ltwo_lsb
 337           vpaddd        0x40($const),$Z2,$inout1        # .Lone_lsb
 338           vpaddd        $Z1,$Z2,$inout2
 339         vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
 340           vpaddd        $Z1,$inout1,$inout3
 341           vpshufb       $Ii,$inout1,$inout1
 342           vpaddd        $Z1,$inout2,$inout4
 343           vpshufb       $Ii,$inout2,$inout2
 344           vpxor         $rndkey,$inout1,$inout1
 345           vpaddd        $Z1,$inout3,$inout5
 346           vpshufb       $Ii,$inout3,$inout3
 347           vpxor         $rndkey,$inout2,$inout2
 348           vpaddd        $Z1,$inout4,$T1         # byte-swapped next counter value
 349           vpshufb       $Ii,$inout4,$inout4
 350           vpshufb       $Ii,$inout5,$inout5
 351           vpshufb       $Ii,$T1,$T1             # next counter value
 352         jmp             .Lresume_ctr32
 353
 354 .align  32
             # Common tail for all key sizes. $rndkey is the round key still to
             # be applied with vaesenc and $T1 is the round key loaded last.
             # Each input block is XORed with $T1 and the result is handed to
             # vaesenclast as its round-key operand, so the final round and the
             # CTR XOR with the input happen in a single instruction.
 355 .Lenc_tail:
 356           vaesenc       $rndkey,$inout0,$inout0
 357         vmovdqu         $Z3,16+8(%rsp)          # postpone vpxor $Z3,$Xi,$Xi
 358         vpalignr        \$8,$Z0,$Z0,$Xi         # 2nd phase
 359           vaesenc       $rndkey,$inout1,$inout1
 360         vpclmulqdq      \$0x10,$Hkey,$Z0,$Z0
 361           vpxor         0x00($inp),$T1,$T2
 362           vaesenc       $rndkey,$inout2,$inout2
 363           vpxor         0x10($inp),$T1,$Ii
 364           vaesenc       $rndkey,$inout3,$inout3
 365           vpxor         0x20($inp),$T1,$Z1
 366           vaesenc       $rndkey,$inout4,$inout4
 367           vpxor         0x30($inp),$T1,$Z2
 368           vaesenc       $rndkey,$inout5,$inout5
 369           vpxor         0x40($inp),$T1,$Z3
 370           vpxor         0x50($inp),$T1,$Hkey
 371           vmovdqu       ($ivp),$T1              # load next counter value
 372
 373           vaesenclast   $T2,$inout0,$inout0
 374           vmovdqu       0x20($const),$T2        # borrow $T2, .Lone_msb
 375           vaesenclast   $Ii,$inout1,$inout1
 376          vpaddb         $T2,$T1,$Ii
 377         mov             %r13,0x70+8(%rsp)       # byte-reversed 0x08($in0), half of next I[0]
 378         lea             0x60($inp),$inp
 379           vaesenclast   $Z1,$inout2,$inout2
 380          vpaddb         $T2,$Ii,$Z1
 381         mov             %r12,0x78+8(%rsp)       # byte-reversed 0x00($in0), other half
 382         lea             0x60($out),$out
 383           vmovdqu       0x00-0x80($key),$rndkey
 384           vaesenclast   $Z2,$inout3,$inout3
 385          vpaddb         $T2,$Z1,$Z2
 386           vaesenclast   $Z3, $inout4,$inout4
 387          vpaddb         $T2,$Z2,$Z3
 388           vaesenclast   $Hkey,$inout5,$inout5
 389          vpaddb         $T2,$Z3,$Hkey
 390
 391         add             \$0x60,$ret
 392         sub             \$0x6,$len
 393         jc              .L6x_done               # no further group of six, leave output in registers
 394
 395           vmovups       $inout0,-0x60($out)     # save output
 396          vpxor          $rndkey,$T1,$inout0
 397           vmovups       $inout1,-0x50($out)
 398          vmovdqa        $Ii,$inout1             # 0 latency
 399           vmovups       $inout2,-0x40($out)
 400          vmovdqa        $Z1,$inout2             # 0 latency
 401           vmovups       $inout3,-0x30($out)
 402          vmovdqa        $Z2,$inout3             # 0 latency
 403           vmovups       $inout4,-0x20($out)
 404          vmovdqa        $Z3,$inout4             # 0 latency
 405           vmovups       $inout5,-0x10($out)
 406          vmovdqa        $Hkey,$inout5           # 0 latency
 407         vmovdqu         0x20+8(%rsp),$Z3        # I[5]
 408         jmp             .Loop6x
 409
 410 .L6x_done:
 411         vpxor           16+8(%rsp),$Xi,$Xi      # modulo-scheduled
 412         vpxor           $Z0,$Xi,$Xi             # modulo-scheduled
 413
 414         ret
 415 .size   _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
 416 ___
 417 ######################################################################
 418 #
 419 # size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
 420 #               const AES_KEY *key, unsigned char iv[16],
 421 #               struct { u128 Xi,H,Htbl[9]; } *Xip);
 422 $code.=<<___;
             # aesni_gcm_decrypt: public entry point, see the C prototype in the
             # comment above. The value returned in %rax is the byte count
             # accumulated in $ret by _aesni_ctr32_ghash_6x, which is always a
             # multiple of 0x60, or 0 when len is below 0x60.
 423 .globl  aesni_gcm_decrypt
 424 .type   aesni_gcm_decrypt,\@function,6
 425 .align  32
 426 aesni_gcm_decrypt:
 427         xor     $ret,$ret
 428
 429         # We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60)
 430         # bytes of input.
 431         cmp     \$0x60,$len                     # minimal accepted length
 432         jb      .Lgcm_dec_abort
 433
 434         lea     (%rsp),%rax                     # save stack pointer
 435         push    %rbx
 436         push    %rbp
 437         push    %r12
 438         push    %r13
 439         push    %r14
 440         push    %r15
 441 ___
     # Win64 only: %xmm6-%xmm15 are callee-saved there. They are stored in the
     # 0xa0 bytes directly below the six pushed registers plus 8 bytes of
     # padding, addressed relative to the entry stack pointer kept in %rax.
 442 $code.=<<___ if ($win64);
 443         lea     -0xa8(%rsp),%rsp
 444         movaps  %xmm6,-0xd8(%rax)
 445         movaps  %xmm7,-0xc8(%rax)
 446         movaps  %xmm8,-0xb8(%rax)
 447         movaps  %xmm9,-0xa8(%rax)
 448         movaps  %xmm10,-0x98(%rax)
 449         movaps  %xmm11,-0x88(%rax)
 450         movaps  %xmm12,-0x78(%rax)
 451         movaps  %xmm13,-0x68(%rax)
 452         movaps  %xmm14,-0x58(%rax)
 453         movaps  %xmm15,-0x48(%rax)
 454 .Lgcm_dec_body:
 455 ___
 456 $code.=<<___;
 457         vzeroupper
 458
 459         vmovdqu         ($ivp),$T1              # input counter value
 460         add             \$-128,%rsp             # 128 bytes of scratch for I[0..5] and postponed terms
 461         mov             12($ivp),$counter       # bytes 12..15 of the counter block
 462         lea             .Lbswap_mask(%rip),$const
 463         lea             -0x80($key),$in0        # borrow $in0
 464         mov             \$0xf80,$end0           # borrow $end0
 465         vmovdqu         ($Xip),$Xi              # load Xi
 466         and             \$-128,%rsp             # ensure stack alignment
 467         vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
 468         lea             0x80($key),$key         # size optimization
 469         lea             0x20+0x20($Xip),$Xip    # size optimization
 470         mov             0xf0-0x80($key),$rounds
 471         vpshufb         $Ii,$Xi,$Xi
 472
             # Compare address bits 7..11 (mask 0xf80) of the scratch area with
             # those of the address 0x80 below the key schedule. If the scratch
             # area starts less than 768 bytes above that address within the
             # 4KB window, move %rsp down by the difference. The difference is
             # a multiple of 128, so the alignment set up above is preserved.
 473         and             $end0,$in0
 474         and             %rsp,$end0
 475         sub             $in0,$end0
 476         jc              .Ldec_no_key_aliasing
 477         cmp             \$768,$end0
 478         jnc             .Ldec_no_key_aliasing
 479         sub             $end0,%rsp              # avoid aliasing with key
 480 .Ldec_no_key_aliasing:
 481
 482         vmovdqu         0x50($inp),$Z3          # I[5]
 483         lea             ($inp),$in0
 484         vmovdqu         0x40($inp),$Z0
 485
 486         # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
 487         # bytes before the end of the input. Note, in particular, that this is
 488         # correct even if |$len| is not an even multiple of 96 or 16. XXX: This
 489         # seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must
 490         # not be near the very beginning of the address space when |$len| < 2*96
 491         # (0xc0).
 492         lea             -0xc0($inp,$len),$end0
 493
 494         vmovdqu         0x30($inp),$Z1
 495         shr             \$4,$len                # bytes to 16-byte blocks
 496         xor             $ret,$ret
 497         vmovdqu         0x20($inp),$Z2
 498          vpshufb        $Ii,$Z3,$Z3             # passed to _aesni_ctr32_ghash_6x
 499         vmovdqu         0x10($inp),$T2
 500          vpshufb        $Ii,$Z0,$Z0
 501         vmovdqu         ($inp),$Hkey
 502          vpshufb        $Ii,$Z1,$Z1
 503         vmovdqu         $Z0,0x30(%rsp)          # I[4]
 504          vpshufb        $Ii,$Z2,$Z2
 505         vmovdqu         $Z1,0x40(%rsp)          # I[3]
 506          vpshufb        $Ii,$T2,$T2
 507         vmovdqu         $Z2,0x50(%rsp)          # I[2]
 508          vpshufb        $Ii,$Hkey,$Hkey
 509         vmovdqu         $T2,0x60(%rsp)          # I[1]
 510         vmovdqu         $Hkey,0x70(%rsp)        # I[0]
 511
 512         call            _aesni_ctr32_ghash_6x
 513
             # The helper returns with the last six blocks still in registers
             # and with $out already advanced past them.
 514         vmovups         $inout0,-0x60($out)     # save output
 515         vmovups         $inout1,-0x50($out)
 516         vmovups         $inout2,-0x40($out)
 517         vmovups         $inout3,-0x30($out)
 518         vmovups         $inout4,-0x20($out)
 519         vmovups         $inout5,-0x10($out)
 520
 521         vpshufb         ($const),$Xi,$Xi        # .Lbswap_mask
 522         vmovdqu         $Xi,-0x40($Xip)         # output Xi
 523
 524         vzeroupper
 525 ___
 526 $code.=<<___ if ($win64);
 527         movaps  -0xd8(%rax),%xmm6
 528         movaps  -0xc8(%rax),%xmm7
 529         movaps  -0xb8(%rax),%xmm8
 530         movaps  -0xa8(%rax),%xmm9
 531         movaps  -0x98(%rax),%xmm10
 532         movaps  -0x88(%rax),%xmm11
 533         movaps  -0x78(%rax),%xmm12
 534         movaps  -0x68(%rax),%xmm13
 535         movaps  -0x58(%rax),%xmm14
 536         movaps  -0x48(%rax),%xmm15
 537 ___
 538 $code.=<<___;
             # Reload the pushed registers relative to the entry stack pointer
             # in %rax rather than popping, since %rsp was realigned and
             # possibly moved after the pushes.
 539         mov     -48(%rax),%r15
 540         mov     -40(%rax),%r14
 541         mov     -32(%rax),%r13
 542         mov     -24(%rax),%r12
 543         mov     -16(%rax),%rbp
 544         mov     -8(%rax),%rbx
 545         lea     (%rax),%rsp             # restore %rsp
 546 .Lgcm_dec_abort:
 547         mov     $ret,%rax               # return value
 548         ret
 549 .size   aesni_gcm_decrypt,.-aesni_gcm_decrypt
 550 ___
 551
 552 $code.=<<___;
 553 .type   _aesni_ctr32_6x,\@abi-omnipotent
             # _aesni_ctr32_6x: AES-CTR over six 16-byte blocks with no GHASH
             # work. aesni_gcm_encrypt calls it twice before it hands over to
             # _aesni_ctr32_ghash_6x.
             # In:  $T1 = counter block, $counter = bytes 12..15 of that block,
             #      $key biased by +0x80, $rounds, $const = address of
             #      .Lbswap_mask, $Ii = byte-swap mask (only used on the
             #      .Lhandle_ctr32_2 path).
             # Out: 0x60 bytes stored at $out, $inp and $out advanced by 0x60,
             #      $T1 = next counter block, $inout0..$inout5 = the six output
             #      blocks.
             # Clobbers %r12, %r13, $Z0, $Z1, $Z2, $Xi, $T2, $Hkey and $rndkey.
 554 .align  32
 555 _aesni_ctr32_6x:
 556         vmovdqu         0x00-0x80($key),$Z0     # borrow $Z0 for $rndkey
 557         vmovdqu         0x20($const),$T2        # borrow $T2, .Lone_msb
 558         lea             -1($rounds),%r13        # iteration count for .Loop_ctr32
 559         vmovups         0x10-0x80($key),$rndkey
 560         lea             0x20-0x80($key),%r12    # address of the round key to load next
 561         vpxor           $Z0,$T1,$inout0
 562         add             \$`6<<24`,$counter
 563         jc              .Lhandle_ctr32_2        # byte-wise increments would wrap, use 32-bit adds
 564         vpaddb          $T2,$T1,$inout1
 565         vpaddb          $T2,$inout1,$inout2
 566         vpxor           $Z0,$inout1,$inout1
 567         vpaddb          $T2,$inout2,$inout3
 568         vpxor           $Z0,$inout2,$inout2
 569         vpaddb          $T2,$inout3,$inout4
 570         vpxor           $Z0,$inout3,$inout3
 571         vpaddb          $T2,$inout4,$inout5
 572         vpxor           $Z0,$inout4,$inout4
 573         vpaddb          $T2,$inout5,$T1
 574         vpxor           $Z0,$inout5,$inout5
 575         jmp             .Loop_ctr32
 576
 577 .align  16
             # Each iteration applies the round key in $rndkey to all six blocks
             # and fetches the following one. After the loop one more vaesenc
             # and the vaesenclast remain.
 578 .Loop_ctr32:
 579         vaesenc         $rndkey,$inout0,$inout0
 580         vaesenc         $rndkey,$inout1,$inout1
 581         vaesenc         $rndkey,$inout2,$inout2
 582         vaesenc         $rndkey,$inout3,$inout3
 583         vaesenc         $rndkey,$inout4,$inout4
 584         vaesenc         $rndkey,$inout5,$inout5
 585         vmovups         (%r12),$rndkey
 586         lea             0x10(%r12),%r12
 587         dec             %r13d
 588         jnz             .Loop_ctr32
 589
             # The input blocks are XORed with the last round key first, so that
             # vaesenclast finishes the cipher and applies the CTR XOR at once.
 590         vmovdqu         (%r12),$Hkey            # last round key
 591         vaesenc         $rndkey,$inout0,$inout0
 592         vpxor           0x00($inp),$Hkey,$Z0
 593         vaesenc         $rndkey,$inout1,$inout1
 594         vpxor           0x10($inp),$Hkey,$Z1
 595         vaesenc         $rndkey,$inout2,$inout2
 596         vpxor           0x20($inp),$Hkey,$Z2
 597         vaesenc         $rndkey,$inout3,$inout3
 598         vpxor           0x30($inp),$Hkey,$Xi
 599         vaesenc         $rndkey,$inout4,$inout4
 600         vpxor           0x40($inp),$Hkey,$T2
 601         vaesenc         $rndkey,$inout5,$inout5
 602         vpxor           0x50($inp),$Hkey,$Hkey
 603         lea             0x60($inp),$inp
 604
 605         vaesenclast     $Z0,$inout0,$inout0
 606         vaesenclast     $Z1,$inout1,$inout1
 607         vaesenclast     $Z2,$inout2,$inout2
 608         vaesenclast     $Xi,$inout3,$inout3
 609         vaesenclast     $T2,$inout4,$inout4
 610         vaesenclast     $Hkey,$inout5,$inout5
 611         vmovups         $inout0,0x00($out)
 612         vmovups         $inout1,0x10($out)
 613         vmovups         $inout2,0x20($out)
 614         vmovups         $inout3,0x30($out)
 615         vmovups         $inout4,0x40($out)
 616         vmovups         $inout5,0x50($out)
 617         lea             0x60($out),$out
 618
 619         ret
 620 .align  32
             # Slow path for a carry out of the 6<<24 add to $counter: step the
             # byte-swapped counter with 32-bit adds, swap the results back,
             # apply the first round key held in $Z0 and join the round loop.
 621 .Lhandle_ctr32_2:
 622         vpshufb         $Ii,$T1,$Z2             # byte-swap counter
 623         vmovdqu         0x30($const),$Z1        # borrow $Z1, .Ltwo_lsb
 624         vpaddd          0x40($const),$Z2,$inout1        # .Lone_lsb
 625         vpaddd          $Z1,$Z2,$inout2
 626         vpaddd          $Z1,$inout1,$inout3
 627         vpshufb         $Ii,$inout1,$inout1
 628         vpaddd          $Z1,$inout2,$inout4
 629         vpshufb         $Ii,$inout2,$inout2
 630         vpxor           $Z0,$inout1,$inout1
 631         vpaddd          $Z1,$inout3,$inout5
 632         vpshufb         $Ii,$inout3,$inout3
 633         vpxor           $Z0,$inout2,$inout2
 634         vpaddd          $Z1,$inout4,$T1         # byte-swapped next counter value
 635         vpshufb         $Ii,$inout4,$inout4
 636         vpxor           $Z0,$inout3,$inout3
 637         vpshufb         $Ii,$inout5,$inout5
 638         vpxor           $Z0,$inout4,$inout4
 639         vpshufb         $Ii,$T1,$T1             # next counter value
 640         vpxor           $Z0,$inout5,$inout5
 641         jmp     .Loop_ctr32
 642 .size   _aesni_ctr32_6x,.-_aesni_ctr32_6x
 643
 644 .globl  aesni_gcm_encrypt
 645 .type   aesni_gcm_encrypt,\@function,6
 646 .align  32
 647 aesni_gcm_encrypt:
 648         xor     $ret,$ret
 649
 650         # We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of
 651         # input. Then we call |_aesni_ctr32_ghash_6x|, which requires at
 652         # least 96 more bytes of input.
 653         cmp     \$0x60*3,$len                   # minimal accepted length
 654         jb      .Lgcm_enc_abort
 655
 656         lea     (%rsp),%rax                     # save stack pointer
 657         push    %rbx
 658         push    %rbp
 659         push    %r12
 660         push    %r13
 661         push    %r14
 662         push    %r15
 663 ___
 664 $code.=<<___ if ($win64);
 665         lea     -0xa8(%rsp),%rsp
 666         movaps  %xmm6,-0xd8(%rax)
 667         movaps  %xmm7,-0xc8(%rax)
 668         movaps  %xmm8,-0xb8(%rax)
 669         movaps  %xmm9,-0xa8(%rax)
 670         movaps  %xmm10,-0x98(%rax)
 671         movaps  %xmm11,-0x88(%rax)
 672         movaps  %xmm12,-0x78(%rax)
 673         movaps  %xmm13,-0x68(%rax)
 674         movaps  %xmm14,-0x58(%rax)
 675         movaps  %xmm15,-0x48(%rax)
 676 .Lgcm_enc_body:
 677 ___
 678 $code.=<<___;
 679         vzeroupper
 680
 681         vmovdqu         ($ivp),$T1              # input counter value
 682         add             \$-128,%rsp
 683         mov             12($ivp),$counter
 684         lea             .Lbswap_mask(%rip),$const
 685         lea             -0x80($key),$in0        # borrow $in0
 686         mov             \$0xf80,$end0           # borrow $end0
 687         lea             0x80($key),$key         # size optimization
 688         vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
 689         and             \$-128,%rsp             # ensure stack alignment
 690         mov             0xf0-0x80($key),$rounds
 691
 692         and             $end0,$in0
 693         and             %rsp,$end0
 694         sub             $in0,$end0
 695         jc              .Lenc_no_key_aliasing
 696         cmp             \$768,$end0
 697         jnc             .Lenc_no_key_aliasing
 698         sub             $end0,%rsp              # avoid aliasing with key
 699 .Lenc_no_key_aliasing:
 700
 701         lea             ($out),$in0
 702
 703         # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
 704         # bytes before the end of the input. Note, in particular, that this is
 705         # correct even if |$len| is not an even multiple of 96 or 16. Unlike in
 706         # the decryption case, there's no caveat that |$out| must not be near
 707         # the very beginning of the address space, because we know that
 708         # |$len| >= 3*96 from the check above, and so we know
 709         # |$out| + |$len| >= 2*96 (0xc0).
 710         lea             -0xc0($out,$len),$end0
 711
 712         shr             \$4,$len
 713
 714         call            _aesni_ctr32_6x
 715         vpshufb         $Ii,$inout0,$Xi         # save bswapped output on stack
 716         vpshufb         $Ii,$inout1,$T2
 717         vmovdqu         $Xi,0x70(%rsp)
 718         vpshufb         $Ii,$inout2,$Z0
 719         vmovdqu         $T2,0x60(%rsp)
 720         vpshufb         $Ii,$inout3,$Z1
 721         vmovdqu         $Z0,0x50(%rsp)
 722         vpshufb         $Ii,$inout4,$Z2
 723         vmovdqu         $Z1,0x40(%rsp)
 724         vpshufb         $Ii,$inout5,$Z3         # passed to _aesni_ctr32_ghash_6x
 725         vmovdqu         $Z2,0x30(%rsp)
 726
 727         call            _aesni_ctr32_6x
 728
 729         vmovdqu         ($Xip),$Xi              # load Xi
 730         lea             0x20+0x20($Xip),$Xip    # size optimization
 731         sub             \$12,$len
 732         mov             \$0x60*2,$ret
 733         vpshufb         $Ii,$Xi,$Xi
 734
 735         call            _aesni_ctr32_ghash_6x
 736         vmovdqu         0x20(%rsp),$Z3          # I[5]
 737          vmovdqu        ($const),$Ii            # borrow $Ii for .Lbswap_mask
 738         vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
 739         vpunpckhqdq     $Z3,$Z3,$T1
 740         vmovdqu         0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK
 741          vmovups        $inout0,-0x60($out)     # save output
 742          vpshufb        $Ii,$inout0,$inout0     # but keep bswapped copy
 743         vpxor           $Z3,$T1,$T1
 744          vmovups        $inout1,-0x50($out)
 745          vpshufb        $Ii,$inout1,$inout1
 746          vmovups        $inout2,-0x40($out)
 747          vpshufb        $Ii,$inout2,$inout2
 748          vmovups        $inout3,-0x30($out)
 749          vpshufb        $Ii,$inout3,$inout3
 750          vmovups        $inout4,-0x20($out)
 751          vpshufb        $Ii,$inout4,$inout4
 752          vmovups        $inout5,-0x10($out)
 753          vpshufb        $Ii,$inout5,$inout5
 754          vmovdqu        $inout0,0x10(%rsp)      # free $inout0
 755 ___
 756 { my ($HK,$T3)=($rndkey,$inout0);
 757
 758 $code.=<<___;
 759          vmovdqu        0x30(%rsp),$Z2          # I[4]
 760          vmovdqu        0x10-0x20($Xip),$Ii     # borrow $Ii for $Hkey^2
 761          vpunpckhqdq    $Z2,$Z2,$T2
 762         vpclmulqdq      \$0x00,$Hkey,$Z3,$Z1
 763          vpxor          $Z2,$T2,$T2
 764         vpclmulqdq      \$0x11,$Hkey,$Z3,$Z3
 765         vpclmulqdq      \$0x00,$HK,$T1,$T1
 766
 767          vmovdqu        0x40(%rsp),$T3          # I[3]
 768         vpclmulqdq      \$0x00,$Ii,$Z2,$Z0
 769          vmovdqu        0x30-0x20($Xip),$Hkey   # $Hkey^3
 770         vpxor           $Z1,$Z0,$Z0
 771          vpunpckhqdq    $T3,$T3,$Z1
 772         vpclmulqdq      \$0x11,$Ii,$Z2,$Z2
 773          vpxor          $T3,$Z1,$Z1
 774         vpxor           $Z3,$Z2,$Z2
 775         vpclmulqdq      \$0x10,$HK,$T2,$T2
 776          vmovdqu        0x50-0x20($Xip),$HK
 777         vpxor           $T1,$T2,$T2
 778
 779          vmovdqu        0x50(%rsp),$T1          # I[2]
 780         vpclmulqdq      \$0x00,$Hkey,$T3,$Z3
 781          vmovdqu        0x40-0x20($Xip),$Ii     # borrow $Ii for $Hkey^4
 782         vpxor           $Z0,$Z3,$Z3
 783          vpunpckhqdq    $T1,$T1,$Z0
 784         vpclmulqdq      \$0x11,$Hkey,$T3,$T3
 785          vpxor          $T1,$Z0,$Z0
 786         vpxor           $Z2,$T3,$T3
 787         vpclmulqdq      \$0x00,$HK,$Z1,$Z1
 788         vpxor           $T2,$Z1,$Z1
 789
 790          vmovdqu        0x60(%rsp),$T2          # I[1]
 791         vpclmulqdq      \$0x00,$Ii,$T1,$Z2
 792          vmovdqu        0x60-0x20($Xip),$Hkey   # $Hkey^5
 793         vpxor           $Z3,$Z2,$Z2
 794          vpunpckhqdq    $T2,$T2,$Z3
 795         vpclmulqdq      \$0x11,$Ii,$T1,$T1
 796          vpxor          $T2,$Z3,$Z3
 797         vpxor           $T3,$T1,$T1
 798         vpclmulqdq      \$0x10,$HK,$Z0,$Z0
 799          vmovdqu        0x80-0x20($Xip),$HK
 800         vpxor           $Z1,$Z0,$Z0
 801
 802          vpxor          0x70(%rsp),$Xi,$Xi      # accumulate I[0]
 803         vpclmulqdq      \$0x00,$Hkey,$T2,$Z1
 804          vmovdqu        0x70-0x20($Xip),$Ii     # borrow $Ii for $Hkey^6
 805          vpunpckhqdq    $Xi,$Xi,$T3
 806         vpxor           $Z2,$Z1,$Z1
 807         vpclmulqdq      \$0x11,$Hkey,$T2,$T2
 808          vpxor          $Xi,$T3,$T3
 809         vpxor           $T1,$T2,$T2
 810         vpclmulqdq      \$0x00,$HK,$Z3,$Z3
 811         vpxor           $Z0,$Z3,$Z0
 812
 813         vpclmulqdq      \$0x00,$Ii,$Xi,$Z2
 814          vmovdqu        0x00-0x20($Xip),$Hkey   # $Hkey^1
 815          vpunpckhqdq    $inout5,$inout5,$T1
 816         vpclmulqdq      \$0x11,$Ii,$Xi,$Xi
 817          vpxor          $inout5,$T1,$T1
 818         vpxor           $Z1,$Z2,$Z1
 819         vpclmulqdq      \$0x10,$HK,$T3,$T3
 820          vmovdqu        0x20-0x20($Xip),$HK
 821         vpxor           $T2,$Xi,$Z3
 822         vpxor           $Z0,$T3,$Z2
 823
 824          vmovdqu        0x10-0x20($Xip),$Ii     # borrow $Ii for $Hkey^2
 825           vpxor         $Z1,$Z3,$T3             # aggregated Karatsuba post-processing
 826         vpclmulqdq      \$0x00,$Hkey,$inout5,$Z0
 827           vpxor         $T3,$Z2,$Z2
 828          vpunpckhqdq    $inout4,$inout4,$T2
 829         vpclmulqdq      \$0x11,$Hkey,$inout5,$inout5
 830          vpxor          $inout4,$T2,$T2
 831           vpslldq       \$8,$Z2,$T3
 832         vpclmulqdq      \$0x00,$HK,$T1,$T1
 833           vpxor         $T3,$Z1,$Xi
 834           vpsrldq       \$8,$Z2,$Z2
 835           vpxor         $Z2,$Z3,$Z3
 836
 837         vpclmulqdq      \$0x00,$Ii,$inout4,$Z1
 838          vmovdqu        0x30-0x20($Xip),$Hkey   # $Hkey^3
 839         vpxor           $Z0,$Z1,$Z1
 840          vpunpckhqdq    $inout3,$inout3,$T3
 841         vpclmulqdq      \$0x11,$Ii,$inout4,$inout4
 842          vpxor          $inout3,$T3,$T3
 843         vpxor           $inout5,$inout4,$inout4
 844           vpalignr      \$8,$Xi,$Xi,$inout5     # 1st phase
 845         vpclmulqdq      \$0x10,$HK,$T2,$T2
 846          vmovdqu        0x50-0x20($Xip),$HK
 847         vpxor           $T1,$T2,$T2
 848
 849         vpclmulqdq      \$0x00,$Hkey,$inout3,$Z0
 850          vmovdqu        0x40-0x20($Xip),$Ii     # borrow $Ii for $Hkey^4
 851         vpxor           $Z1,$Z0,$Z0
 852          vpunpckhqdq    $inout2,$inout2,$T1
 853         vpclmulqdq      \$0x11,$Hkey,$inout3,$inout3
 854          vpxor          $inout2,$T1,$T1
 855         vpxor           $inout4,$inout3,$inout3
 856           vxorps        0x10(%rsp),$Z3,$Z3      # accumulate $inout0
 857         vpclmulqdq      \$0x00,$HK,$T3,$T3
 858         vpxor           $T2,$T3,$T3
 859
 860           vpclmulqdq    \$0x10,0x10($const),$Xi,$Xi
 861           vxorps        $inout5,$Xi,$Xi
 862
 863         vpclmulqdq      \$0x00,$Ii,$inout2,$Z1
 864          vmovdqu        0x60-0x20($Xip),$Hkey   # $Hkey^5
 865         vpxor           $Z0,$Z1,$Z1
 866          vpunpckhqdq    $inout1,$inout1,$T2
 867         vpclmulqdq      \$0x11,$Ii,$inout2,$inout2
 868          vpxor          $inout1,$T2,$T2
 869           vpalignr      \$8,$Xi,$Xi,$inout5     # 2nd phase
 870         vpxor           $inout3,$inout2,$inout2
 871         vpclmulqdq      \$0x10,$HK,$T1,$T1
 872          vmovdqu        0x80-0x20($Xip),$HK
 873         vpxor           $T3,$T1,$T1
 874
 875           vxorps        $Z3,$inout5,$inout5
 876           vpclmulqdq    \$0x10,0x10($const),$Xi,$Xi
 877           vxorps        $inout5,$Xi,$Xi
 878
 879         vpclmulqdq      \$0x00,$Hkey,$inout1,$Z0
 880          vmovdqu        0x70-0x20($Xip),$Ii     # borrow $Ii for $Hkey^6
 881         vpxor           $Z1,$Z0,$Z0
 882          vpunpckhqdq    $Xi,$Xi,$T3
 883         vpclmulqdq      \$0x11,$Hkey,$inout1,$inout1
 884          vpxor          $Xi,$T3,$T3
 885         vpxor           $inout2,$inout1,$inout1
 886         vpclmulqdq      \$0x00,$HK,$T2,$T2
 887         vpxor           $T1,$T2,$T2
 888
 889         vpclmulqdq      \$0x00,$Ii,$Xi,$Z1
 890         vpclmulqdq      \$0x11,$Ii,$Xi,$Z3
 891         vpxor           $Z0,$Z1,$Z1
 892         vpclmulqdq      \$0x10,$HK,$T3,$Z2
 893         vpxor           $inout1,$Z3,$Z3
 894         vpxor           $T2,$Z2,$Z2
 895
 896         vpxor           $Z1,$Z3,$Z0             # aggregated Karatsuba post-processing
 897         vpxor           $Z0,$Z2,$Z2
 898         vpslldq         \$8,$Z2,$T1
 899         vmovdqu         0x10($const),$Hkey      # .Lpoly
 900         vpsrldq         \$8,$Z2,$Z2
 901         vpxor           $T1,$Z1,$Xi
 902         vpxor           $Z2,$Z3,$Z3
 903
 904         vpalignr        \$8,$Xi,$Xi,$T2         # 1st phase
 905         vpclmulqdq      \$0x10,$Hkey,$Xi,$Xi
 906         vpxor           $T2,$Xi,$Xi
 907
 908         vpalignr        \$8,$Xi,$Xi,$T2         # 2nd phase
 909         vpclmulqdq      \$0x10,$Hkey,$Xi,$Xi
 910         vpxor           $Z3,$T2,$T2
 911         vpxor           $T2,$Xi,$Xi
 912 ___
 913 }
 914 $code.=<<___;
 915         vpshufb         ($const),$Xi,$Xi        # .Lbswap_mask
 916         vmovdqu         $Xi,-0x40($Xip)         # output Xi
 917
 918         vzeroupper
 919 ___
 920 $code.=<<___ if ($win64);
 921         movaps  -0xd8(%rax),%xmm6
 922         movaps  -0xc8(%rax),%xmm7
 923         movaps  -0xb8(%rax),%xmm8
 924         movaps  -0xa8(%rax),%xmm9
 925         movaps  -0x98(%rax),%xmm10
 926         movaps  -0x88(%rax),%xmm11
 927         movaps  -0x78(%rax),%xmm12
 928         movaps  -0x68(%rax),%xmm13
 929         movaps  -0x58(%rax),%xmm14
 930         movaps  -0x48(%rax),%xmm15
 931 ___
 932 $code.=<<___;
 933         mov     -48(%rax),%r15
 934         mov     -40(%rax),%r14
 935         mov     -32(%rax),%r13
 936         mov     -24(%rax),%r12
 937         mov     -16(%rax),%rbp
 938         mov     -8(%rax),%rbx
 939         lea     (%rax),%rsp             # restore %rsp
 940 .Lgcm_enc_abort:
 941         mov     $ret,%rax               # return value
 942         ret
 943 .size   aesni_gcm_encrypt,.-aesni_gcm_encrypt
 944 ___
 945
 # Constant pool shared by the GCM routines in this file.
 #
 # Layout matters: the code keeps a single pointer ($const) to .Lbswap_mask
 # and reaches .Lpoly as 0x10($const), so the 16-byte entries below must stay
 # in this exact order with nothing inserted between them.
 #
 #   .Lbswap_mask  vpshufb control 15..0: reverses the byte order of a
 #                 16-byte vector (used for "bswapped" copies and for Xi).
 #   .Lpoly        0xc2 in the top byte; multiplied in with vpclmulqdq during
 #                 the two reduction phases of the GHASH computation.
 #   .Lone_msb     value 1 in byte 15 only.
 #   .Ltwo_lsb     value 2 in byte 0 only.
 #   .Lone_lsb     value 1 in byte 0 only.
 #
 # NOTE(review): .Lone_msb/.Ltwo_lsb/.Lone_lsb are not referenced in this part
 # of the file; presumably they are counter increments for the CTR code and
 # are also addressed relative to .Lbswap_mask -- confirm before reordering.
 #
 # The table is bracketed by 64-byte alignment directives and followed by the
 # module identification string.
 946 $code.=<<___;
 947 .align  64
 948 .Lbswap_mask:
 949         .byte   15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
 950 .Lpoly:
 951         .byte   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
 952 .Lone_msb:
 953         .byte   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
 954 .Ltwo_lsb:
 955         .byte   2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
 956 .Lone_lsb:
 957         .byte   1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
 958 .asciz  "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 959 .align  64
 960 ___
 # Win64 only: emit the structured-exception-handling (SEH) support for
 # aesni_gcm_encrypt/aesni_gcm_decrypt -- a language-specific handler plus
 # the .pdata/.xdata records that attach it to both functions.
 #
 # The four scalars name the registers carrying the handler's arguments
 # (the first four Win64 integer argument registers).  Only $context and
 # $disp are dereferenced by the code below; $rec and $frame are unused.
 961 if ($win64) {
 962 $rec="%rcx";
 963 $frame="%rdx";
 964 $context="%r8";
 965 $disp="%r9";
 966
 # gcm_se_handler
 #
 # Purpose: rebuild the caller's register state in the CONTEXT record so the
 # system unwinder can continue past a GCM routine that changed %rsp and
 # saved non-volatile registers by hand.
 #
 # Stack: pushes rsi, rdi, rbx, rbp, r12-r15 and the flags, then reserves
 # 64 bytes = 32 bytes of shadow space + four stack arguments (stored at
 # 32..56(%rsp)) for the RtlVirtualUnwind call.
 #
 # Decision, based on context->Rip versus the two labels in HandlerData
 # (RVAs, rebased with disp->ImageBase):
 #   1. Rip <  HandlerData[0] (.Lgcm_*_body):  nothing is restored; %rax is
 #      context->Rax and control goes straight to the common tail.
 #   2. Rip >= HandlerData[1] (.Lgcm_*_abort): nothing is restored; %rax is
 #      context->Rsp and control goes straight to the common tail.
 #   3. Otherwise (inside the body): %rax is context->Rax again, and
 #        - r15,r14,r13,r12,rbp,rbx are read from -48..-8(%rax) and written
 #          into the CONTEXT record,
 #        - 20 quadwords (xmm6-xmm15) are copied from -0xd8(%rax) to
 #          context.Xmm6.
 #      These offsets mirror what the function epilogue in this file uses
 #      (pushes below the saved stack pointer, xmm save area at -0xd8).
 #      This relies on %rax holding the saved entry stack pointer for the
 #      whole range between the two labels.
 #
 # Common tail: %rax is treated as the frame's stack pointer; 8(%rax) and
 # 16(%rax) are read back as the saved rdi/rsi and, together with %rax
 # itself (as Rsp), stored into the CONTEXT.  The CONTEXT (154 quadwords) is
 # then copied to disp->ContextRecord and RtlVirtualUnwind is called with
 # UNW_FLAG_NHANDLER.  The handler returns 1 (ExceptionContinueSearch).
 #
 # NOTE(review): the rdi/rsi home slots at 8/16(%rax) and the initial
 # "%rax = entry %rsp" in case 1 are presumably set up by the Win64 prologue
 # that the perlasm translator generates -- not visible here, confirm.
 #
 # Unwind tables emitted after the handler:
 #   .pdata  one (begin RVA, end RVA, unwind-info RVA) triple per function.
 #   .xdata  per function: bytes 9,0,0,0 (per the Win64 UNWIND_INFO layout:
 #           version 1 with the exception-handler flag, no prologue codes),
 #           the handler RVA, then the two HandlerData RVAs consumed above
 #           (body label, abort label).
 967 $code.=<<___
 968 .extern __imp_RtlVirtualUnwind
 969 .type   gcm_se_handler,\@abi-omnipotent
 970 .align  16
 971 gcm_se_handler:
 972         push    %rsi
 973         push    %rdi
 974         push    %rbx
 975         push    %rbp
 976         push    %r12
 977         push    %r13
 978         push    %r14
 979         push    %r15
 980         pushfq
 981         sub     \$64,%rsp
 982
 983         mov     120($context),%rax      # pull context->Rax
 984         mov     248($context),%rbx      # pull context->Rip
 985
 986         mov     8($disp),%rsi           # disp->ImageBase
 987         mov     56($disp),%r11          # disp->HandlerData
 988
 989         mov     0(%r11),%r10d           # HandlerData[0]
 990         lea     (%rsi,%r10),%r10        # prologue label
 991         cmp     %r10,%rbx               # context->Rip<prologue label
 992         jb      .Lcommon_seh_tail
 993
 994         mov     152($context),%rax      # pull context->Rsp
 995
 996         mov     4(%r11),%r10d           # HandlerData[1]
 997         lea     (%rsi,%r10),%r10        # epilogue label
 998         cmp     %r10,%rbx               # context->Rip>=epilogue label
 999         jae     .Lcommon_seh_tail
1000
1001         mov     120($context),%rax      # pull context->Rax
1002
1003         mov     -48(%rax),%r15
1004         mov     -40(%rax),%r14
1005         mov     -32(%rax),%r13
1006         mov     -24(%rax),%r12
1007         mov     -16(%rax),%rbp
1008         mov     -8(%rax),%rbx
1009         mov     %r15,240($context)
1010         mov     %r14,232($context)
1011         mov     %r13,224($context)
1012         mov     %r12,216($context)
1013         mov     %rbp,160($context)
1014         mov     %rbx,144($context)
1015
1016         lea     -0xd8(%rax),%rsi        # %xmm save area
1017         lea     512($context),%rdi      # & context.Xmm6
1018         mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
1019         .long   0xa548f3fc              # cld; rep movsq
1020
1021 .Lcommon_seh_tail:
1022         mov     8(%rax),%rdi
1023         mov     16(%rax),%rsi
1024         mov     %rax,152($context)      # restore context->Rsp
1025         mov     %rsi,168($context)      # restore context->Rsi
1026         mov     %rdi,176($context)      # restore context->Rdi
1027
1028         mov     40($disp),%rdi          # disp->ContextRecord
1029         mov     $context,%rsi           # context
1030         mov     \$154,%ecx              # sizeof(CONTEXT)
1031         .long   0xa548f3fc              # cld; rep movsq
1032
1033         mov     $disp,%rsi
1034         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1035         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1036         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1037         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1038         mov     40(%rsi),%r10           # disp->ContextRecord
1039         lea     56(%rsi),%r11           # &disp->HandlerData
1040         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1041         mov     %r10,32(%rsp)           # arg5
1042         mov     %r11,40(%rsp)           # arg6
1043         mov     %r12,48(%rsp)           # arg7
1044         mov     %rcx,56(%rsp)           # arg8, (NULL)
1045         call    *__imp_RtlVirtualUnwind(%rip)
1046
1047         mov     \$1,%eax                # ExceptionContinueSearch
1048         add     \$64,%rsp
1049         popfq
1050         pop     %r15
1051         pop     %r14
1052         pop     %r13
1053         pop     %r12
1054         pop     %rbp
1055         pop     %rbx
1056         pop     %rdi
1057         pop     %rsi
1058         ret
1059 .size   gcm_se_handler,.-gcm_se_handler
1060
1061 .section        .pdata
1062 .align  4
1063         .rva    .LSEH_begin_aesni_gcm_decrypt
1064         .rva    .LSEH_end_aesni_gcm_decrypt
1065         .rva    .LSEH_gcm_dec_info
1066
1067         .rva    .LSEH_begin_aesni_gcm_encrypt
1068         .rva    .LSEH_end_aesni_gcm_encrypt
1069         .rva    .LSEH_gcm_enc_info
1070 .section        .xdata
1071 .align  8
1072 .LSEH_gcm_dec_info:
1073         .byte   9,0,0,0
1074         .rva    gcm_se_handler
1075         .rva    .Lgcm_dec_body,.Lgcm_dec_abort
1076 .LSEH_gcm_enc_info:
1077         .byte   9,0,0,0
1078         .rva    gcm_se_handler
1079         .rva    .Lgcm_enc_body,.Lgcm_enc_abort
1080 ___
1081 }
1082 }}} else {{{
# Fallback output for this branch: instead of the stitched implementation,
# emit two stub entry points that keep the symbols aesni_gcm_encrypt and
# aesni_gcm_decrypt defined.  Each stub only clears %eax and returns, i.e.
# it reports 0 to its caller and touches neither its arguments nor memory.
# Note the plain assignment (not .=): whatever was accumulated in $code is
# replaced by these stubs.
# NOTE(review): a return value of 0 presumably means "no bytes processed",
# so that the caller falls back to another code path -- confirm at the
# call sites.
1083 $code=<<___;    # assembler is too old
1084 .text
1085
1086 .globl  aesni_gcm_encrypt
1087 .type   aesni_gcm_encrypt,\@abi-omnipotent
1088 aesni_gcm_encrypt:
1089         xor     %eax,%eax
1090         ret
1091 .size   aesni_gcm_encrypt,.-aesni_gcm_encrypt
1092
1093 .globl  aesni_gcm_decrypt
1094 .type   aesni_gcm_decrypt,\@abi-omnipotent
1095 aesni_gcm_decrypt:
1096         xor     %eax,%eax
1097         ret
1098 .size   aesni_gcm_decrypt,.-aesni_gcm_decrypt
1099 ___
1100 }}}
1101
# Final stage of the generator.
#
# 1. Replace every `...` span in the accumulated assembly text with the
#    result of evaluating its contents as a Perl expression, so constant
#    arithmetic written between backticks is folded before output.
1102 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
1103
# 2. Write the finished assembly to STDOUT.
1104 print $code;
1105
# 3. Close STDOUT explicitly and fail loudly if that does not succeed.
#    Buffered output is only guaranteed to be flushed at close time, so a
#    write error (full disk, closed pipe) surfaces here; ignoring it would
#    leave a silently truncated assembly file behind a zero exit status.
1106 close STDOUT or die "error closing STDOUT: $!";