# Check that ed25519 and ed448 are allowed by the security level
# [oweals/openssl.git] / crypto / modes / asm / aesni-gcm-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 #
18 # AES-NI-CTR+GHASH stitch.
19 #
20 # February 2013
21 #
22 # OpenSSL GCM implementation is organized in such way that its
23 # performance is rather close to the sum of its streamed components,
24 # in the context parallelized AES-NI CTR and modulo-scheduled
25 # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
26 # was observed to perform significantly better than the sum of the
27 # components on contemporary CPUs, the effort was deemed impossible to
28 # justify. This module is based on combination of Intel submissions,
29 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
30 # Locktyukhin of Intel Corp. who verified that it reduces shuffles
31 # pressure with notable relative improvement, achieving 1.0 cycle per
32 # byte processed with 128-bit key on Haswell processor, 0.74 - on
33 # Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
34 # measurements for favourable packet size, one divisible by 96.
35 # Applications using the EVP interface will observe a few percent
36 # worse performance.]
37 #
38 # Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
39 #
40 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
41 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
42
# Command-line handling: perlasm scripts are invoked either as
# "script.pl flavour [outfile]" or as "script.pl outfile".
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64 perlasm translator next to this script or in the
# top-level perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the toolchain for AVX support: $avx==0 - none, 1 - AVX only,
# 2 - AVX2-capable assembler (this module emits code only when $avx>1).
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
                =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
        $avx = ($1>=2.20) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
        $avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
        $avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
        $avx = ($2>=3.0) + ($2>3.0);
}

# Pipe all generated code through the translator.  Fail loudly if the
# pipe cannot be started: an unchecked open would let the script run to
# completion while silently producing an empty/absent output file.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
if ($avx>1) {{{

# Function argument registers (SysV AMD64 order).
($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

# GHASH working set: current input block ($Ii), scratch ($T1,$T2), hash
# key power ($Hkey), accumulation lanes ($Z0-$Z3) and the hash value ($Xi).
($Ii,$T1,$T2,$Hkey,
 $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));

# Six parallel AES states plus the broadcast round key.
($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));

# GPR roles: CTR32 counter word, AES round count, return value (bytes
# processed), constants pointer, GHASH input cursor and its limit.
($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
86
$code=<<___;
.text

#------------------------------------------------------------------------------
# _aesni_ctr32_ghash_6x
#
# Stitched inner loop: each iteration encrypts six AES-CTR32 blocks while
# folding six previously buffered, byte-swapped ciphertext blocks (staged
# at 16+8(%rsp)..0x70+8(%rsp)) into the GHASH accumulator $Xi, using the
# hash-key powers cached relative to ($Xip).  Entered from the
# aesni_gcm_[en|de]crypt bodies with $inp,$out,$len,$key,$ivp,$Xip and
# $counter,$rounds,$const,$in0,$end0 set up as in those prologues.
# Advances $ret by the number of bytes processed; clobbers %r12, %r13 and
# all of %xmm0-%xmm15.  Indentation encodes the interleaved streams:
# GHASH at base column, its modulo-scheduled part one space deeper, AES
# rounds two spaces deeper.
#------------------------------------------------------------------------------
.type   _aesni_ctr32_ghash_6x,\@abi-omnipotent
.align  32
_aesni_ctr32_ghash_6x:
.cfi_startproc
        vmovdqu         0x20($const),$T2        # borrow $T2, .Lone_msb
        sub             \$6,$len
        vpxor           $Z0,$Z0,$Z0             # $Z0   = 0
        vmovdqu         0x00-0x80($key),$rndkey
        vpaddb          $T2,$T1,$inout1
        vpaddb          $T2,$inout1,$inout2
        vpaddb          $T2,$inout2,$inout3
        vpaddb          $T2,$inout3,$inout4
        vpaddb          $T2,$inout4,$inout5
        vpxor           $rndkey,$T1,$inout0
        vmovdqu         $Z0,16+8(%rsp)          # "$Z3" = 0
        jmp             .Loop6x

.align  32
.Loop6x:
        add             \$`6<<24`,$counter
        jc              .Lhandle_ctr32          # discard $inout[1-5]?
        vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
          vpaddb        $T2,$inout5,$T1         # next counter value
          vpxor         $rndkey,$inout1,$inout1
          vpxor         $rndkey,$inout2,$inout2

.Lresume_ctr32:
        vmovdqu         $T1,($ivp)              # save next counter value
        vpclmulqdq      \$0x10,$Hkey,$Z3,$Z1
          vpxor         $rndkey,$inout3,$inout3
          vmovups       0x10-0x80($key),$T2     # borrow $T2 for $rndkey
        vpclmulqdq      \$0x01,$Hkey,$Z3,$Z2
        # compute the GHASH input advance: 0x60 while $in0 has not passed
        # $end0, else 0 (setnc/neg/and below build the mask branchlessly)
        xor             %r12,%r12
        cmp             $in0,$end0

          vaesenc       $T2,$inout0,$inout0
        vmovdqu         0x30+8(%rsp),$Ii        # I[4]
          vpxor         $rndkey,$inout4,$inout4
        vpclmulqdq      \$0x00,$Hkey,$Z3,$T1
          vaesenc       $T2,$inout1,$inout1
          vpxor         $rndkey,$inout5,$inout5
        setnc           %r12b
        vpclmulqdq      \$0x11,$Hkey,$Z3,$Z3
          vaesenc       $T2,$inout2,$inout2
        vmovdqu         0x10-0x20($Xip),$Hkey   # $Hkey^2
        neg             %r12
          vaesenc       $T2,$inout3,$inout3
         vpxor          $Z1,$Z2,$Z2
        vpclmulqdq      \$0x00,$Hkey,$Ii,$Z1
         vpxor          $Z0,$Xi,$Xi             # modulo-scheduled
          vaesenc       $T2,$inout4,$inout4
         vpxor          $Z1,$T1,$Z0
        and             \$0x60,%r12
          vmovups       0x20-0x80($key),$rndkey
        vpclmulqdq      \$0x10,$Hkey,$Ii,$T1
          vaesenc       $T2,$inout5,$inout5

        vpclmulqdq      \$0x01,$Hkey,$Ii,$T2
        lea             ($in0,%r12),$in0
          vaesenc       $rndkey,$inout0,$inout0
         vpxor          16+8(%rsp),$Xi,$Xi      # modulo-scheduled [vpxor $Z3,$Xi,$Xi]
        vpclmulqdq      \$0x11,$Hkey,$Ii,$Hkey
         vmovdqu        0x40+8(%rsp),$Ii        # I[3]
          vaesenc       $rndkey,$inout1,$inout1
        movbe           0x58($in0),%r13
          vaesenc       $rndkey,$inout2,$inout2
        movbe           0x50($in0),%r12
          vaesenc       $rndkey,$inout3,$inout3
        mov             %r13,0x20+8(%rsp)
          vaesenc       $rndkey,$inout4,$inout4
        mov             %r12,0x28+8(%rsp)
        vmovdqu         0x30-0x20($Xip),$Z1     # borrow $Z1 for $Hkey^3
          vaesenc       $rndkey,$inout5,$inout5

          vmovups       0x30-0x80($key),$rndkey
         vpxor          $T1,$Z2,$Z2
        vpclmulqdq      \$0x00,$Z1,$Ii,$T1
          vaesenc       $rndkey,$inout0,$inout0
         vpxor          $T2,$Z2,$Z2
        vpclmulqdq      \$0x10,$Z1,$Ii,$T2
          vaesenc       $rndkey,$inout1,$inout1
         vpxor          $Hkey,$Z3,$Z3
        vpclmulqdq      \$0x01,$Z1,$Ii,$Hkey
          vaesenc       $rndkey,$inout2,$inout2
        vpclmulqdq      \$0x11,$Z1,$Ii,$Z1
         vmovdqu        0x50+8(%rsp),$Ii        # I[2]
          vaesenc       $rndkey,$inout3,$inout3
          vaesenc       $rndkey,$inout4,$inout4
         vpxor          $T1,$Z0,$Z0
        vmovdqu         0x40-0x20($Xip),$T1     # borrow $T1 for $Hkey^4
          vaesenc       $rndkey,$inout5,$inout5

          vmovups       0x40-0x80($key),$rndkey
         vpxor          $T2,$Z2,$Z2
        vpclmulqdq      \$0x00,$T1,$Ii,$T2
          vaesenc       $rndkey,$inout0,$inout0
         vpxor          $Hkey,$Z2,$Z2
        vpclmulqdq      \$0x10,$T1,$Ii,$Hkey
          vaesenc       $rndkey,$inout1,$inout1
        movbe           0x48($in0),%r13
         vpxor          $Z1,$Z3,$Z3
        vpclmulqdq      \$0x01,$T1,$Ii,$Z1
          vaesenc       $rndkey,$inout2,$inout2
        movbe           0x40($in0),%r12
        vpclmulqdq      \$0x11,$T1,$Ii,$T1
         vmovdqu        0x60+8(%rsp),$Ii        # I[1]
          vaesenc       $rndkey,$inout3,$inout3
        mov             %r13,0x30+8(%rsp)
          vaesenc       $rndkey,$inout4,$inout4
        mov             %r12,0x38+8(%rsp)
         vpxor          $T2,$Z0,$Z0
        vmovdqu         0x60-0x20($Xip),$T2     # borrow $T2 for $Hkey^5
          vaesenc       $rndkey,$inout5,$inout5

          vmovups       0x50-0x80($key),$rndkey
         vpxor          $Hkey,$Z2,$Z2
        vpclmulqdq      \$0x00,$T2,$Ii,$Hkey
          vaesenc       $rndkey,$inout0,$inout0
         vpxor          $Z1,$Z2,$Z2
        vpclmulqdq      \$0x10,$T2,$Ii,$Z1
          vaesenc       $rndkey,$inout1,$inout1
        movbe           0x38($in0),%r13
         vpxor          $T1,$Z3,$Z3
        vpclmulqdq      \$0x01,$T2,$Ii,$T1
         vpxor          0x70+8(%rsp),$Xi,$Xi    # accumulate I[0]
          vaesenc       $rndkey,$inout2,$inout2
        movbe           0x30($in0),%r12
        vpclmulqdq      \$0x11,$T2,$Ii,$T2
          vaesenc       $rndkey,$inout3,$inout3
        mov             %r13,0x40+8(%rsp)
          vaesenc       $rndkey,$inout4,$inout4
        mov             %r12,0x48+8(%rsp)
         vpxor          $Hkey,$Z0,$Z0
         vmovdqu        0x70-0x20($Xip),$Hkey   # $Hkey^6
          vaesenc       $rndkey,$inout5,$inout5

          vmovups       0x60-0x80($key),$rndkey
         vpxor          $Z1,$Z2,$Z2
        vpclmulqdq      \$0x10,$Hkey,$Xi,$Z1
          vaesenc       $rndkey,$inout0,$inout0
         vpxor          $T1,$Z2,$Z2
        vpclmulqdq      \$0x01,$Hkey,$Xi,$T1
          vaesenc       $rndkey,$inout1,$inout1
        movbe           0x28($in0),%r13
         vpxor          $T2,$Z3,$Z3
        vpclmulqdq      \$0x00,$Hkey,$Xi,$T2
          vaesenc       $rndkey,$inout2,$inout2
        movbe           0x20($in0),%r12
        vpclmulqdq      \$0x11,$Hkey,$Xi,$Xi
          vaesenc       $rndkey,$inout3,$inout3
        mov             %r13,0x50+8(%rsp)
          vaesenc       $rndkey,$inout4,$inout4
        mov             %r12,0x58+8(%rsp)
        vpxor           $Z1,$Z2,$Z2
          vaesenc       $rndkey,$inout5,$inout5
        vpxor           $T1,$Z2,$Z2

          vmovups       0x70-0x80($key),$rndkey
        vpslldq         \$8,$Z2,$Z1
        vpxor           $T2,$Z0,$Z0
        vmovdqu         0x10($const),$Hkey      # .Lpoly

          vaesenc       $rndkey,$inout0,$inout0
        vpxor           $Xi,$Z3,$Z3
          vaesenc       $rndkey,$inout1,$inout1
        vpxor           $Z1,$Z0,$Z0
        movbe           0x18($in0),%r13
          vaesenc       $rndkey,$inout2,$inout2
        movbe           0x10($in0),%r12
        vpalignr        \$8,$Z0,$Z0,$Ii         # 1st phase
        vpclmulqdq      \$0x10,$Hkey,$Z0,$Z0
        mov             %r13,0x60+8(%rsp)
          vaesenc       $rndkey,$inout3,$inout3
        mov             %r12,0x68+8(%rsp)
          vaesenc       $rndkey,$inout4,$inout4
          vmovups       0x80-0x80($key),$T1     # borrow $T1 for $rndkey
          vaesenc       $rndkey,$inout5,$inout5

          vaesenc       $T1,$inout0,$inout0
          vmovups       0x90-0x80($key),$rndkey
          vaesenc       $T1,$inout1,$inout1
        vpsrldq         \$8,$Z2,$Z2
          vaesenc       $T1,$inout2,$inout2
        vpxor           $Z2,$Z3,$Z3
          vaesenc       $T1,$inout3,$inout3
        vpxor           $Ii,$Z0,$Z0
        movbe           0x08($in0),%r13
          vaesenc       $T1,$inout4,$inout4
        movbe           0x00($in0),%r12
          vaesenc       $T1,$inout5,$inout5
          vmovups       0xa0-0x80($key),$T1
          cmp           \$11,$rounds
          jb            .Lenc_tail              # 128-bit key

          vaesenc       $rndkey,$inout0,$inout0
          vaesenc       $rndkey,$inout1,$inout1
          vaesenc       $rndkey,$inout2,$inout2
          vaesenc       $rndkey,$inout3,$inout3
          vaesenc       $rndkey,$inout4,$inout4
          vaesenc       $rndkey,$inout5,$inout5

          vaesenc       $T1,$inout0,$inout0
          vaesenc       $T1,$inout1,$inout1
          vaesenc       $T1,$inout2,$inout2
          vaesenc       $T1,$inout3,$inout3
          vaesenc       $T1,$inout4,$inout4
          vmovups       0xb0-0x80($key),$rndkey
          vaesenc       $T1,$inout5,$inout5
          vmovups       0xc0-0x80($key),$T1
          je            .Lenc_tail              # 192-bit key

          vaesenc       $rndkey,$inout0,$inout0
          vaesenc       $rndkey,$inout1,$inout1
          vaesenc       $rndkey,$inout2,$inout2
          vaesenc       $rndkey,$inout3,$inout3
          vaesenc       $rndkey,$inout4,$inout4
          vaesenc       $rndkey,$inout5,$inout5

          vaesenc       $T1,$inout0,$inout0
          vaesenc       $T1,$inout1,$inout1
          vaesenc       $T1,$inout2,$inout2
          vaesenc       $T1,$inout3,$inout3
          vaesenc       $T1,$inout4,$inout4
          vmovups       0xd0-0x80($key),$rndkey
          vaesenc       $T1,$inout5,$inout5
          vmovups       0xe0-0x80($key),$T1
          jmp           .Lenc_tail              # 256-bit key

# Taken when the low byte of the big-endian counter wraps: redo all six
# counters with byte-swaps and 32-bit additions instead of vpaddb.
.align  32
.Lhandle_ctr32:
        vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
          vpshufb       $Ii,$T1,$Z2             # byte-swap counter
          vmovdqu       0x30($const),$Z1        # borrow $Z1, .Ltwo_lsb
          vpaddd        0x40($const),$Z2,$inout1        # .Lone_lsb
          vpaddd        $Z1,$Z2,$inout2
        vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
          vpaddd        $Z1,$inout1,$inout3
          vpshufb       $Ii,$inout1,$inout1
          vpaddd        $Z1,$inout2,$inout4
          vpshufb       $Ii,$inout2,$inout2
          vpxor         $rndkey,$inout1,$inout1
          vpaddd        $Z1,$inout3,$inout5
          vpshufb       $Ii,$inout3,$inout3
          vpxor         $rndkey,$inout2,$inout2
          vpaddd        $Z1,$inout4,$T1         # byte-swapped next counter value
          vpshufb       $Ii,$inout4,$inout4
          vpshufb       $Ii,$inout5,$inout5
          vpshufb       $Ii,$T1,$T1             # next counter value
        jmp             .Lresume_ctr32

# Final AES rounds for the six blocks: XOR the keystream with the input,
# finish the 2nd GHASH reduction phase and stage the next six counters.
.align  32
.Lenc_tail:
          vaesenc       $rndkey,$inout0,$inout0
        vmovdqu         $Z3,16+8(%rsp)          # postpone vpxor $Z3,$Xi,$Xi
        vpalignr        \$8,$Z0,$Z0,$Xi         # 2nd phase
          vaesenc       $rndkey,$inout1,$inout1
        vpclmulqdq      \$0x10,$Hkey,$Z0,$Z0
          vpxor         0x00($inp),$T1,$T2
          vaesenc       $rndkey,$inout2,$inout2
          vpxor         0x10($inp),$T1,$Ii
          vaesenc       $rndkey,$inout3,$inout3
          vpxor         0x20($inp),$T1,$Z1
          vaesenc       $rndkey,$inout4,$inout4
          vpxor         0x30($inp),$T1,$Z2
          vaesenc       $rndkey,$inout5,$inout5
          vpxor         0x40($inp),$T1,$Z3
          vpxor         0x50($inp),$T1,$Hkey
          vmovdqu       ($ivp),$T1              # load next counter value

          vaesenclast   $T2,$inout0,$inout0
          vmovdqu       0x20($const),$T2        # borrow $T2, .Lone_msb
          vaesenclast   $Ii,$inout1,$inout1
         vpaddb         $T2,$T1,$Ii
        mov             %r13,0x70+8(%rsp)
        lea             0x60($inp),$inp
          vaesenclast   $Z1,$inout2,$inout2
         vpaddb         $T2,$Ii,$Z1
        mov             %r12,0x78+8(%rsp)
        lea             0x60($out),$out
          vmovdqu       0x00-0x80($key),$rndkey
          vaesenclast   $Z2,$inout3,$inout3
         vpaddb         $T2,$Z1,$Z2
          vaesenclast   $Z3, $inout4,$inout4
         vpaddb         $T2,$Z2,$Z3
          vaesenclast   $Hkey,$inout5,$inout5
         vpaddb         $T2,$Z3,$Hkey

        add             \$0x60,$ret
        sub             \$0x6,$len
        jc              .L6x_done

          vmovups       $inout0,-0x60($out)     # save output
         vpxor          $rndkey,$T1,$inout0
          vmovups       $inout1,-0x50($out)
         vmovdqa        $Ii,$inout1             # 0 latency
          vmovups       $inout2,-0x40($out)
         vmovdqa        $Z1,$inout2             # 0 latency
          vmovups       $inout3,-0x30($out)
         vmovdqa        $Z2,$inout3             # 0 latency
          vmovups       $inout4,-0x20($out)
         vmovdqa        $Z3,$inout4             # 0 latency
          vmovups       $inout5,-0x10($out)
         vmovdqa        $Hkey,$inout5           # 0 latency
        vmovdqu         0x20+8(%rsp),$Z3        # I[5]
        jmp             .Loop6x

.L6x_done:
        vpxor           16+8(%rsp),$Xi,$Xi      # modulo-scheduled
        vpxor           $Z0,$Xi,$Xi             # modulo-scheduled

        ret
.cfi_endproc
.size   _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
___
404 ######################################################################
405 #
406 # size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
407 #               const AES_KEY *key, unsigned char iv[16],
408 #               struct { u128 Xi,H,Htbl[9]; } *Xip);
$code.=<<___;
#------------------------------------------------------------------------------
# size_t aesni_gcm_decrypt(const void *inp, void *out, size_t len,
#               const AES_KEY *key, unsigned char iv[16],
#               struct { u128 Xi,H,Htbl[9]; } *Xip);
#
# Returns the number of bytes processed (a multiple of 0x60); inputs
# shorter than 0x60 bytes are rejected up front (return 0) and any
# remainder beyond the last full 0x60-byte bundle is left to the caller.
#------------------------------------------------------------------------------
.globl  aesni_gcm_decrypt
.type   aesni_gcm_decrypt,\@function,6
.align  32
aesni_gcm_decrypt:
.cfi_startproc
        xor     $ret,$ret
        cmp     \$0x60,$len                     # minimal accepted length
        jb      .Lgcm_dec_abort

        # save all callee-saved GPRs; %rax keeps the original %rsp
        lea     (%rsp),%rax                     # save stack pointer
.cfi_def_cfa_register   %rax
        push    %rbx
.cfi_push       %rbx
        push    %rbp
.cfi_push       %rbp
        push    %r12
.cfi_push       %r12
        push    %r13
.cfi_push       %r13
        push    %r14
.cfi_push       %r14
        push    %r15
.cfi_push       %r15
___
$code.=<<___ if ($win64);
        # Win64 ABI: %xmm6-%xmm15 are non-volatile; spill them below the
        # saved GPRs before the frame is realigned.
        lea     -0xa8(%rsp),%rsp
        movaps  %xmm6,-0xd8(%rax)
        movaps  %xmm7,-0xc8(%rax)
        movaps  %xmm8,-0xb8(%rax)
        movaps  %xmm9,-0xa8(%rax)
        movaps  %xmm10,-0x98(%rax)
        movaps  %xmm11,-0x88(%rax)
        movaps  %xmm12,-0x78(%rax)
        movaps  %xmm13,-0x68(%rax)
        movaps  %xmm14,-0x58(%rax)
        movaps  %xmm15,-0x48(%rax)
.Lgcm_dec_body:
___
$code.=<<___;
        vzeroupper

        vmovdqu         ($ivp),$T1              # input counter value
        add             \$-128,%rsp
        mov             12($ivp),$counter
        lea             .Lbswap_mask(%rip),$const
        lea             -0x80($key),$in0        # borrow $in0
        mov             \$0xf80,$end0           # borrow $end0
        vmovdqu         ($Xip),$Xi              # load Xi
        and             \$-128,%rsp             # ensure stack alignment
        vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
        lea             0x80($key),$key         # size optimization
        lea             0x20+0x20($Xip),$Xip    # size optimization
        mov             0xf0-0x80($key),$rounds
        vpshufb         $Ii,$Xi,$Xi

        # keep the 128-byte-aligned stack frame at least 768 bytes away
        # from the key schedule within a 4KB window
        and             $end0,$in0
        and             %rsp,$end0
        sub             $in0,$end0
        jc              .Ldec_no_key_aliasing
        cmp             \$768,$end0
        jnc             .Ldec_no_key_aliasing
        sub             $end0,%rsp              # avoid aliasing with key
.Ldec_no_key_aliasing:

        # decrypt: ciphertext is the GHASH input, so the first six blocks
        # can be byte-swapped and staged on the stack before the stitched
        # loop starts
        vmovdqu         0x50($inp),$Z3          # I[5]
        lea             ($inp),$in0
        vmovdqu         0x40($inp),$Z0
        lea             -0xc0($inp,$len),$end0
        vmovdqu         0x30($inp),$Z1
        shr             \$4,$len
        xor             $ret,$ret
        vmovdqu         0x20($inp),$Z2
         vpshufb        $Ii,$Z3,$Z3             # passed to _aesni_ctr32_ghash_6x
        vmovdqu         0x10($inp),$T2
         vpshufb        $Ii,$Z0,$Z0
        vmovdqu         ($inp),$Hkey
         vpshufb        $Ii,$Z1,$Z1
        vmovdqu         $Z0,0x30(%rsp)
         vpshufb        $Ii,$Z2,$Z2
        vmovdqu         $Z1,0x40(%rsp)
         vpshufb        $Ii,$T2,$T2
        vmovdqu         $Z2,0x50(%rsp)
         vpshufb        $Ii,$Hkey,$Hkey
        vmovdqu         $T2,0x60(%rsp)
        vmovdqu         $Hkey,0x70(%rsp)

        call            _aesni_ctr32_ghash_6x

        vmovups         $inout0,-0x60($out)     # save output
        vmovups         $inout1,-0x50($out)
        vmovups         $inout2,-0x40($out)
        vmovups         $inout3,-0x30($out)
        vmovups         $inout4,-0x20($out)
        vmovups         $inout5,-0x10($out)

        vpshufb         ($const),$Xi,$Xi        # .Lbswap_mask
        vmovdqu         $Xi,-0x40($Xip)         # output Xi

        vzeroupper
___
$code.=<<___ if ($win64);
        # Win64: restore the non-volatile %xmm6-%xmm15 saved in the prologue
        movaps  -0xd8(%rax),%xmm6
        movaps  -0xc8(%rax),%xmm7
        movaps  -0xb8(%rax),%xmm8
        movaps  -0xa8(%rax),%xmm9
        movaps  -0x98(%rax),%xmm10
        movaps  -0x88(%rax),%xmm11
        movaps  -0x78(%rax),%xmm12
        movaps  -0x68(%rax),%xmm13
        movaps  -0x58(%rax),%xmm14
        movaps  -0x48(%rax),%xmm15
___
$code.=<<___;
        # restore the callee-saved GPRs pushed in the prologue (%rax still
        # holds the caller's %rsp)
        mov     -48(%rax),%r15
.cfi_restore    %r15
        mov     -40(%rax),%r14
.cfi_restore    %r14
        mov     -32(%rax),%r13
.cfi_restore    %r13
        mov     -24(%rax),%r12
.cfi_restore    %r12
        mov     -16(%rax),%rbp
.cfi_restore    %rbp
        mov     -8(%rax),%rbx
.cfi_restore    %rbx
        lea     (%rax),%rsp             # restore %rsp
.cfi_def_cfa_register   %rsp
.Lgcm_dec_abort:
        mov     $ret,%rax               # return value
        ret
.cfi_endproc
.size   aesni_gcm_decrypt,.-aesni_gcm_decrypt
___
543
$code.=<<___;
#------------------------------------------------------------------------------
# _aesni_ctr32_6x
#
# Encrypts six CTR32 keystream blocks and XORs them with 0x60 bytes at
# ($inp), storing the result at ($out) and advancing both pointers.  Used
# by aesni_gcm_encrypt to produce ciphertext before the stitched loop can
# start hashing it; $T1 carries the (byte-swapped) next counter value
# across calls.  Clobbers %r12, %r13 and the xmm working set.
#------------------------------------------------------------------------------
.type   _aesni_ctr32_6x,\@abi-omnipotent
.align  32
_aesni_ctr32_6x:
.cfi_startproc
        vmovdqu         0x00-0x80($key),$Z0     # borrow $Z0 for $rndkey
        vmovdqu         0x20($const),$T2        # borrow $T2, .Lone_msb
        lea             -1($rounds),%r13
        vmovups         0x10-0x80($key),$rndkey
        lea             0x20-0x80($key),%r12
        vpxor           $Z0,$T1,$inout0
        add             \$`6<<24`,$counter
        jc              .Lhandle_ctr32_2
        vpaddb          $T2,$T1,$inout1
        vpaddb          $T2,$inout1,$inout2
        vpxor           $Z0,$inout1,$inout1
        vpaddb          $T2,$inout2,$inout3
        vpxor           $Z0,$inout2,$inout2
        vpaddb          $T2,$inout3,$inout4
        vpxor           $Z0,$inout3,$inout3
        vpaddb          $T2,$inout4,$inout5
        vpxor           $Z0,$inout4,$inout4
        vpaddb          $T2,$inout5,$T1
        vpxor           $Z0,$inout5,$inout5
        jmp             .Loop_ctr32

.align  16
.Loop_ctr32:
        vaesenc         $rndkey,$inout0,$inout0
        vaesenc         $rndkey,$inout1,$inout1
        vaesenc         $rndkey,$inout2,$inout2
        vaesenc         $rndkey,$inout3,$inout3
        vaesenc         $rndkey,$inout4,$inout4
        vaesenc         $rndkey,$inout5,$inout5
        vmovups         (%r12),$rndkey
        lea             0x10(%r12),%r12
        dec             %r13d
        jnz             .Loop_ctr32

        vmovdqu         (%r12),$Hkey            # last round key
        vaesenc         $rndkey,$inout0,$inout0
        vpxor           0x00($inp),$Hkey,$Z0
        vaesenc         $rndkey,$inout1,$inout1
        vpxor           0x10($inp),$Hkey,$Z1
        vaesenc         $rndkey,$inout2,$inout2
        vpxor           0x20($inp),$Hkey,$Z2
        vaesenc         $rndkey,$inout3,$inout3
        vpxor           0x30($inp),$Hkey,$Xi
        vaesenc         $rndkey,$inout4,$inout4
        vpxor           0x40($inp),$Hkey,$T2
        vaesenc         $rndkey,$inout5,$inout5
        vpxor           0x50($inp),$Hkey,$Hkey
        lea             0x60($inp),$inp

        vaesenclast     $Z0,$inout0,$inout0
        vaesenclast     $Z1,$inout1,$inout1
        vaesenclast     $Z2,$inout2,$inout2
        vaesenclast     $Xi,$inout3,$inout3
        vaesenclast     $T2,$inout4,$inout4
        vaesenclast     $Hkey,$inout5,$inout5
        vmovups         $inout0,0x00($out)
        vmovups         $inout1,0x10($out)
        vmovups         $inout2,0x20($out)
        vmovups         $inout3,0x30($out)
        vmovups         $inout4,0x40($out)
        vmovups         $inout5,0x50($out)
        lea             0x60($out),$out

        ret
# Taken when the low byte of the big-endian counter wraps: redo all six
# counters with byte-swaps and 32-bit additions instead of vpaddb.
.align  32
.Lhandle_ctr32_2:
        vpshufb         $Ii,$T1,$Z2             # byte-swap counter
        vmovdqu         0x30($const),$Z1        # borrow $Z1, .Ltwo_lsb
        vpaddd          0x40($const),$Z2,$inout1        # .Lone_lsb
        vpaddd          $Z1,$Z2,$inout2
        vpaddd          $Z1,$inout1,$inout3
        vpshufb         $Ii,$inout1,$inout1
        vpaddd          $Z1,$inout2,$inout4
        vpshufb         $Ii,$inout2,$inout2
        vpxor           $Z0,$inout1,$inout1
        vpaddd          $Z1,$inout3,$inout5
        vpshufb         $Ii,$inout3,$inout3
        vpxor           $Z0,$inout2,$inout2
        vpaddd          $Z1,$inout4,$T1         # byte-swapped next counter value
        vpshufb         $Ii,$inout4,$inout4
        vpxor           $Z0,$inout3,$inout3
        vpshufb         $Ii,$inout5,$inout5
        vpxor           $Z0,$inout4,$inout4
        vpshufb         $Ii,$T1,$T1             # next counter value
        vpxor           $Z0,$inout5,$inout5
        jmp     .Loop_ctr32
.cfi_endproc
.size   _aesni_ctr32_6x,.-_aesni_ctr32_6x

#------------------------------------------------------------------------------
# size_t aesni_gcm_encrypt(const void *inp, void *out, size_t len,
#               const AES_KEY *key, unsigned char iv[16],
#               struct { u128 Xi,H,Htbl[9]; } *Xip);
#
# Requires at least 3*0x60 bytes: the first two six-block bundles are
# produced by _aesni_ctr32_6x before the stitched encrypt+hash loop takes
# over, i.e. GHASH lags encryption by twelve blocks.
#------------------------------------------------------------------------------
.globl  aesni_gcm_encrypt
.type   aesni_gcm_encrypt,\@function,6
.align  32
aesni_gcm_encrypt:
.cfi_startproc
        xor     $ret,$ret
        cmp     \$0x60*3,$len                   # minimal accepted length
        jb      .Lgcm_enc_abort

        # save all callee-saved GPRs; %rax keeps the original %rsp
        lea     (%rsp),%rax                     # save stack pointer
.cfi_def_cfa_register   %rax
        push    %rbx
.cfi_push       %rbx
        push    %rbp
.cfi_push       %rbp
        push    %r12
.cfi_push       %r12
        push    %r13
.cfi_push       %r13
        push    %r14
.cfi_push       %r14
        push    %r15
.cfi_push       %r15
___
$code.=<<___ if ($win64);
        # Win64 ABI: %xmm6-%xmm15 are non-volatile; spill them below the
        # saved GPRs before the frame is realigned.
        lea     -0xa8(%rsp),%rsp
        movaps  %xmm6,-0xd8(%rax)
        movaps  %xmm7,-0xc8(%rax)
        movaps  %xmm8,-0xb8(%rax)
        movaps  %xmm9,-0xa8(%rax)
        movaps  %xmm10,-0x98(%rax)
        movaps  %xmm11,-0x88(%rax)
        movaps  %xmm12,-0x78(%rax)
        movaps  %xmm13,-0x68(%rax)
        movaps  %xmm14,-0x58(%rax)
        movaps  %xmm15,-0x48(%rax)
.Lgcm_enc_body:
___
$code.=<<___;
        vzeroupper

        vmovdqu         ($ivp),$T1              # input counter value
        add             \$-128,%rsp
        mov             12($ivp),$counter
        lea             .Lbswap_mask(%rip),$const
        lea             -0x80($key),$in0        # borrow $in0
        mov             \$0xf80,$end0           # borrow $end0
        lea             0x80($key),$key         # size optimization
        vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
        and             \$-128,%rsp             # ensure stack alignment
        mov             0xf0-0x80($key),$rounds

        # keep the 128-byte-aligned stack frame at least 768 bytes away
        # from the key schedule within a 4KB window
        and             $end0,$in0
        and             %rsp,$end0
        sub             $in0,$end0
        jc              .Lenc_no_key_aliasing
        cmp             \$768,$end0
        jnc             .Lenc_no_key_aliasing
        sub             $end0,%rsp              # avoid aliasing with key
.Lenc_no_key_aliasing:

        # encrypt: GHASH consumes our own output, so $in0/$end0 track $out
        lea             ($out),$in0
        lea             -0xc0($out,$len),$end0
        shr             \$4,$len

        call            _aesni_ctr32_6x
        vpshufb         $Ii,$inout0,$Xi         # save bswapped output on stack
        vpshufb         $Ii,$inout1,$T2
        vmovdqu         $Xi,0x70(%rsp)
        vpshufb         $Ii,$inout2,$Z0
        vmovdqu         $T2,0x60(%rsp)
        vpshufb         $Ii,$inout3,$Z1
        vmovdqu         $Z0,0x50(%rsp)
        vpshufb         $Ii,$inout4,$Z2
        vmovdqu         $Z1,0x40(%rsp)
        vpshufb         $Ii,$inout5,$Z3         # passed to _aesni_ctr32_ghash_6x
        vmovdqu         $Z2,0x30(%rsp)

        call            _aesni_ctr32_6x

        vmovdqu         ($Xip),$Xi              # load Xi
        lea             0x20+0x20($Xip),$Xip    # size optimization
        sub             \$12,$len
        mov             \$0x60*2,$ret
        vpshufb         $Ii,$Xi,$Xi

        call            _aesni_ctr32_ghash_6x
        vmovdqu         0x20(%rsp),$Z3          # I[5]
         vmovdqu        ($const),$Ii            # borrow $Ii for .Lbswap_mask
        vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
        vpunpckhqdq     $Z3,$Z3,$T1
        vmovdqu         0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK
         vmovups        $inout0,-0x60($out)     # save output
         vpshufb        $Ii,$inout0,$inout0     # but keep bswapped copy
        vpxor           $Z3,$T1,$T1
         vmovups        $inout1,-0x50($out)
         vpshufb        $Ii,$inout1,$inout1
         vmovups        $inout2,-0x40($out)
         vpshufb        $Ii,$inout2,$inout2
         vmovups        $inout3,-0x30($out)
         vpshufb        $Ii,$inout3,$inout3
         vmovups        $inout4,-0x20($out)
         vpshufb        $Ii,$inout4,$inout4
         vmovups        $inout5,-0x10($out)
         vpshufb        $Ii,$inout5,$inout5
         vmovdqu        $inout0,0x10(%rsp)      # free $inout0
___
# Tail of aesni_gcm_encrypt: fold the last 12 bswapped ciphertext blocks
# (six spilled to the stack at 0x10..0x70(%rsp), six still live in
# $inout0..$inout5) into the GHASH accumulator $Xi.
# $HK aliases $rndkey and holds the Karatsuba constant loaded from
# 0x20/0x50/0x80-0x20($Xip); $T3 aliases $inout0, which was freed by the
# vmovdqu spill to 0x10(%rsp) immediately before this scope.
{ my ($HK,$T3)=($rndkey,$inout0);

# Emitted code below is a hand-scheduled 6-way aggregated-reduction GHASH:
# each group multiplies one input block by the matching power of H
# ($Hkey^1..$Hkey^6, loaded relative to $Xip), using three vpclmulqdq per
# block (lo, hi, mid/Karatsuba) and interleaving independent chains
# (indentation depth in the asm marks the separate dependency chains).
# Two such passes are chained; between them $Xi undergoes the two-phase
# Montgomery-style reduction against .Lpoly (0x10($const)).  The final
# "aggregated Karatsuba post-processing" + vpalignr/vpclmulqdq pair folds
# the 256-bit product back to the 128-bit $Xi.
# NOTE(review): do not reorder or reformat the heredoc text — instruction
# scheduling here is deliberate and performance-critical.
$code.=<<___;
	 vmovdqu	0x30(%rsp),$Z2		# I[4]
	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
	 vpunpckhqdq	$Z2,$Z2,$T2
	vpclmulqdq	\$0x00,$Hkey,$Z3,$Z1
	 vpxor		$Z2,$T2,$T2
	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
	vpclmulqdq	\$0x00,$HK,$T1,$T1

	 vmovdqu	0x40(%rsp),$T3		# I[3]
	vpclmulqdq	\$0x00,$Ii,$Z2,$Z0
	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$T3,$T3,$Z1
	vpclmulqdq	\$0x11,$Ii,$Z2,$Z2
	 vpxor		$T3,$Z1,$Z1
	vpxor		$Z3,$Z2,$Z2
	vpclmulqdq	\$0x10,$HK,$T2,$T2
	 vmovdqu	0x50-0x20($Xip),$HK
	vpxor		$T1,$T2,$T2

	 vmovdqu	0x50(%rsp),$T1		# I[2]
	vpclmulqdq	\$0x00,$Hkey,$T3,$Z3
	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
	vpxor		$Z0,$Z3,$Z3
	 vpunpckhqdq	$T1,$T1,$Z0
	vpclmulqdq	\$0x11,$Hkey,$T3,$T3
	 vpxor		$T1,$Z0,$Z0
	vpxor		$Z2,$T3,$T3
	vpclmulqdq	\$0x00,$HK,$Z1,$Z1
	vpxor		$T2,$Z1,$Z1

	 vmovdqu	0x60(%rsp),$T2		# I[1]
	vpclmulqdq	\$0x00,$Ii,$T1,$Z2
	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
	vpxor		$Z3,$Z2,$Z2
	 vpunpckhqdq	$T2,$T2,$Z3
	vpclmulqdq	\$0x11,$Ii,$T1,$T1
	 vpxor		$T2,$Z3,$Z3
	vpxor		$T3,$T1,$T1
	vpclmulqdq	\$0x10,$HK,$Z0,$Z0
	 vmovdqu	0x80-0x20($Xip),$HK
	vpxor		$Z1,$Z0,$Z0

	 vpxor		0x70(%rsp),$Xi,$Xi	# accumulate I[0]
	vpclmulqdq	\$0x00,$Hkey,$T2,$Z1
	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
	 vpunpckhqdq	$Xi,$Xi,$T3
	vpxor		$Z2,$Z1,$Z1
	vpclmulqdq	\$0x11,$Hkey,$T2,$T2
	 vpxor		$Xi,$T3,$T3
	vpxor		$T1,$T2,$T2
	vpclmulqdq	\$0x00,$HK,$Z3,$Z3
	vpxor		$Z0,$Z3,$Z0

	vpclmulqdq	\$0x00,$Ii,$Xi,$Z2
	 vmovdqu	0x00-0x20($Xip),$Hkey	# $Hkey^1
	 vpunpckhqdq	$inout5,$inout5,$T1
	vpclmulqdq	\$0x11,$Ii,$Xi,$Xi
	 vpxor		$inout5,$T1,$T1
	vpxor		$Z1,$Z2,$Z1
	vpclmulqdq	\$0x10,$HK,$T3,$T3
	 vmovdqu	0x20-0x20($Xip),$HK
	vpxor		$T2,$Xi,$Z3
	vpxor		$Z0,$T3,$Z2

	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
	  vpxor		$Z1,$Z3,$T3		# aggregated Karatsuba post-processing
	vpclmulqdq	\$0x00,$Hkey,$inout5,$Z0
	  vpxor		$T3,$Z2,$Z2
	 vpunpckhqdq	$inout4,$inout4,$T2
	vpclmulqdq	\$0x11,$Hkey,$inout5,$inout5
	 vpxor		$inout4,$T2,$T2
	  vpslldq	\$8,$Z2,$T3
	vpclmulqdq	\$0x00,$HK,$T1,$T1
	  vpxor		$T3,$Z1,$Xi
	  vpsrldq	\$8,$Z2,$Z2
	  vpxor		$Z2,$Z3,$Z3

	vpclmulqdq	\$0x00,$Ii,$inout4,$Z1
	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
	vpxor		$Z0,$Z1,$Z1
	 vpunpckhqdq	$inout3,$inout3,$T3
	vpclmulqdq	\$0x11,$Ii,$inout4,$inout4
	 vpxor		$inout3,$T3,$T3
	vpxor		$inout5,$inout4,$inout4
	  vpalignr	\$8,$Xi,$Xi,$inout5	# 1st phase
	vpclmulqdq	\$0x10,$HK,$T2,$T2
	 vmovdqu	0x50-0x20($Xip),$HK
	vpxor		$T1,$T2,$T2

	vpclmulqdq	\$0x00,$Hkey,$inout3,$Z0
	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$inout2,$inout2,$T1
	vpclmulqdq	\$0x11,$Hkey,$inout3,$inout3
	 vpxor		$inout2,$T1,$T1
	vpxor		$inout4,$inout3,$inout3
	  vxorps	0x10(%rsp),$Z3,$Z3	# accumulate $inout0
	vpclmulqdq	\$0x00,$HK,$T3,$T3
	vpxor		$T2,$T3,$T3

	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
	  vxorps	$inout5,$Xi,$Xi

	vpclmulqdq	\$0x00,$Ii,$inout2,$Z1
	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
	vpxor		$Z0,$Z1,$Z1
	 vpunpckhqdq	$inout1,$inout1,$T2
	vpclmulqdq	\$0x11,$Ii,$inout2,$inout2
	 vpxor		$inout1,$T2,$T2
	  vpalignr	\$8,$Xi,$Xi,$inout5	# 2nd phase
	vpxor		$inout3,$inout2,$inout2
	vpclmulqdq	\$0x10,$HK,$T1,$T1
	 vmovdqu	0x80-0x20($Xip),$HK
	vpxor		$T3,$T1,$T1

	  vxorps	$Z3,$inout5,$inout5
	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
	  vxorps	$inout5,$Xi,$Xi

	vpclmulqdq	\$0x00,$Hkey,$inout1,$Z0
	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$Xi,$Xi,$T3
	vpclmulqdq	\$0x11,$Hkey,$inout1,$inout1
	 vpxor		$Xi,$T3,$T3
	vpxor		$inout2,$inout1,$inout1
	vpclmulqdq	\$0x00,$HK,$T2,$T2
	vpxor		$T1,$T2,$T2

	vpclmulqdq	\$0x00,$Ii,$Xi,$Z1
	vpclmulqdq	\$0x11,$Ii,$Xi,$Z3
	vpxor		$Z0,$Z1,$Z1
	vpclmulqdq	\$0x10,$HK,$T3,$Z2
	vpxor		$inout1,$Z3,$Z3
	vpxor		$T2,$Z2,$Z2

	vpxor		$Z1,$Z3,$Z0		# aggregated Karatsuba post-processing
	vpxor		$Z0,$Z2,$Z2
	vpslldq		\$8,$Z2,$T1
	vmovdqu		0x10($const),$Hkey	# .Lpoly
	vpsrldq		\$8,$Z2,$Z2
	vpxor		$T1,$Z1,$Xi
	vpxor		$Z2,$Z3,$Z3

	vpalignr	\$8,$Xi,$Xi,$T2		# 1st phase
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	vpalignr	\$8,$Xi,$Xi,$T2		# 2nd phase
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
	vpxor		$Z3,$T2,$T2
	vpxor		$T2,$Xi,$Xi
___
}
# Epilogue of aesni_gcm_encrypt: byte-swap the final $Xi back to GCM wire
# order, store it at -0x40($Xip) (compensating the "size optimization"
# bias applied to $Xip earlier), and clear the AVX upper lanes before
# returning to SSE/C code.
$code.=<<___;
	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu		$Xi,-0x40($Xip)		# output Xi

	vzeroupper
___
# Win64 ABI: xmm6-xmm15 are callee-saved; restore them from the save
# area below the frame anchor in %rax (laid out by the prologue).
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
# Restore the six callee-saved GPRs (with matching CFI annotations for
# unwinders), reinstate %rsp from %rax, and return the byte count kept
# in $ret ($ret was set to 0x60*2 plus whatever the ghash loop added).
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lgcm_enc_abort:
	mov	$ret,%rax		# return value
	ret
.cfi_endproc
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
___
942
# Read-only constant pool shared by both entry points:
#   .Lbswap_mask - vpshufb mask converting between little-endian lanes
#                  and GCM big-endian byte order
#   .Lpoly       - GHASH reduction constant (0xc2 in the top byte)
#   .Lone_msb / .Ltwo_lsb / .Lone_lsb - counter increments in the two
#                  byte orders used by the CTR32 code
# 64-byte alignment keeps the table within a single cache line.
$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
	.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.asciz	"AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
# Windows-only structured-exception-handling support.  Emits
# gcm_se_handler, a language-specific handler that RtlVirtualUnwind
# calls when an exception unwinds through aesni_gcm_{en,de}crypt, plus
# the .pdata/.xdata records that register it for both functions.
if ($win64) {
# RtlVirtualUnwind conventional argument names (Win64 ABI: rcx,rdx,r8,r9).
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___
.extern	__imp_RtlVirtualUnwind
.type	gcm_se_handler,\@abi-omnipotent
.align	16
gcm_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	120($context),%rax	# pull context->Rax

	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	mov	%r15,240($context)
	mov	%r14,232($context)
	mov	%r13,224($context)
	mov	%r12,216($context)
	mov	%rbp,160($context)
	mov	%rbx,144($context)

	lea	-0xd8(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	gcm_se_handler,.-gcm_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_gcm_decrypt
	.rva	.LSEH_end_aesni_gcm_decrypt
	.rva	.LSEH_gcm_dec_info

	.rva	.LSEH_begin_aesni_gcm_encrypt
	.rva	.LSEH_end_aesni_gcm_encrypt
	.rva	.LSEH_gcm_enc_info
.section	.xdata
.align	8
.LSEH_gcm_dec_info:
	.byte	9,0,0,0
	.rva	gcm_se_handler
	.rva	.Lgcm_dec_body,.Lgcm_dec_abort
.LSEH_gcm_enc_info:
	.byte	9,0,0,0
	.rva	gcm_se_handler
	.rva	.Lgcm_enc_body,.Lgcm_enc_abort
___
}
}}} else {{{
# Fallback branch taken when the detected assembler is too old to encode
# the AVX/PCLMULQDQ instructions used above: emit stub entry points that
# return 0 ("no bytes processed") so callers fall back to the generic
# CTR+GHASH code paths.
$code=<<___;	# assembler is too old
.text

.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,\@abi-omnipotent
aesni_gcm_encrypt:
.cfi_startproc
	xor	%eax,%eax
	ret
.cfi_endproc
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt

.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,\@abi-omnipotent
aesni_gcm_decrypt:
.cfi_startproc
	xor	%eax,%eax
	ret
.cfi_endproc
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
___
}}}
1102
# Expand `...` constructs (compile-time Perl expressions embedded in the
# assembly text) and write the finished assembly to stdout.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# Buffered output is only flushed at close; without checking its status a
# write failure (e.g. full disk or closed pipe) would yield a truncated
# .s file and a successful exit code, silently corrupting the build.
close STDOUT or die "error closing STDOUT: $!";