# Unify all assembler file generators
# [oweals/openssl.git] / crypto / modes / asm / aesni-gcm-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 #
18 # AES-NI-CTR+GHASH stitch.
19 #
20 # February 2013
21 #
22 # OpenSSL GCM implementation is organized in such way that its
23 # performance is rather close to the sum of its streamed components,
24 # in the context parallelized AES-NI CTR and modulo-scheduled
25 # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
26 # was observed to perform significantly better than the sum of the
27 # components on contemporary CPUs, the effort was deemed impossible to
28 # justify. This module is based on combination of Intel submissions,
29 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
30 # Locktyukhin of Intel Corp. who verified that it reduces shuffles
31 # pressure with notable relative improvement, achieving 1.0 cycle per
32 # byte processed with 128-bit key on Haswell processor, 0.74 - on
33 # Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
34 # measurements for favourable packet size, one divisible by 96.
35 # Applications using the EVP interface will observe a few percent
36 # worse performance.]
37 #
38 # Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
39 #
40 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
41 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
42
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Windows targets are recognized by the flavour (nasm/masm/mingw64) or by
# an explicit .asm output extension.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator next to this script, or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the assembler for instruction-set support: $avx ends up 0, 1 or 2
# depending on the detected version; the stitched code below is emitted
# only when $avx>1.  Captured version strings are compared numerically
# (e.g. "2.22" >= 2.20), relying on Perl's string-to-number coercion.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.20) + ($1>=2.22);
}

# Windows fallback: query nasm directly.
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

# Windows fallback: query Microsoft's ml64.
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

# Last resort: clang's integrated assembler (version via the compiler).
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Pipe everything printed to STDOUT through the perlasm translator, which
# converts the perl-interpolated assembly into the requested flavour.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
77
78 if ($avx>1) {{{
79
# Argument registers, in SysV AMD64 order (perlasm retargets for Win64):
# input ptr, output ptr, length, AES key schedule, IV/counter, Xi/Htable.
($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

# GHASH working set: scratch ($Ii,$T1,$T2), current hash-key power
# ($Hkey), partial-product accumulators ($Z0-$Z3) and the running hash
# value ($Xi) in xmm0-xmm8.
($Ii,$T1,$T2,$Hkey,
 $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));

# Six parallel AES-CTR state blocks plus the broadcast round key in
# xmm9-xmm15.
($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));

# GPR roles: CTR counter word, AES round count, running return value,
# constant-table base, input cursor, input-end sentinel.
($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
88
$code=<<___;
.text

######################################################################
# _aesni_ctr32_ghash_6x
#
# Stitched inner loop: encrypts six counter blocks (in $inout0-$inout5)
# with AES-NI while simultaneously folding six previously staged,
# byte-swapped data blocks into the GHASH accumulator $Xi with
# vpclmulqdq -- 96 bytes of payload per iteration.  Internal calling
# convention (not a public ABI): $T1 holds the current counter block,
# $Z3/$Xi the pending GHASH state, $counter the counter's low word;
# staged hash input lives at 0x20+8(%rsp)..0x78+8(%rsp), where the +8
# compensates for the return address pushed by the call.  Each
# iteration advances $inp/$out by 0x60, subtracts 6 from $len and adds
# 0x60 to $ret; returns when $len goes negative.
.type   _aesni_ctr32_ghash_6x,\@abi-omnipotent
.align  32
_aesni_ctr32_ghash_6x:
        vmovdqu         0x20($const),$T2        # borrow $T2, .Lone_msb
        sub             \$6,$len
        vpxor           $Z0,$Z0,$Z0             # $Z0   = 0
        vmovdqu         0x00-0x80($key),$rndkey
        vpaddb          $T2,$T1,$inout1
        vpaddb          $T2,$inout1,$inout2
        vpaddb          $T2,$inout2,$inout3
        vpaddb          $T2,$inout3,$inout4
        vpaddb          $T2,$inout4,$inout5
        vpxor           $rndkey,$T1,$inout0
        vmovdqu         $Z0,16+8(%rsp)          # "$Z3" = 0
        jmp             .Loop6x

.align  32
.Loop6x:
        # Bump the counter word by 6 in its most significant (little-
        # endian) byte; a carry means the increment may wrap a counter
        # byte, so the six lane counters are rebuilt via 32-bit
        # arithmetic in .Lhandle_ctr32 instead of vpaddb.
        add             \$`6<<24`,$counter
        jc              .Lhandle_ctr32          # discard $inout[1-5]?
        vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
          vpaddb        $T2,$inout5,$T1         # next counter value
          vpxor         $rndkey,$inout1,$inout1
          vpxor         $rndkey,$inout2,$inout2

.Lresume_ctr32:
        vmovdqu         $T1,($ivp)              # save next counter value
        vpclmulqdq      \$0x10,$Hkey,$Z3,$Z1
          vpxor         $rndkey,$inout3,$inout3
          vmovups       0x10-0x80($key),$T2     # borrow $T2 for $rndkey
        vpclmulqdq      \$0x01,$Hkey,$Z3,$Z2
        # Branch-free end-of-input handling, spread over the scattered
        # xor/cmp/setnc/neg/and below: %r12 becomes 0x60 while $in0 has
        # not reached $end0, else 0, so the lea keeps advancing $in0
        # only while more input remains.
        xor             %r12,%r12
        cmp             $in0,$end0

          vaesenc       $T2,$inout0,$inout0
        vmovdqu         0x30+8(%rsp),$Ii        # I[4]
          vpxor         $rndkey,$inout4,$inout4
        vpclmulqdq      \$0x00,$Hkey,$Z3,$T1
          vaesenc       $T2,$inout1,$inout1
          vpxor         $rndkey,$inout5,$inout5
        setnc           %r12b
        vpclmulqdq      \$0x11,$Hkey,$Z3,$Z3
          vaesenc       $T2,$inout2,$inout2
        vmovdqu         0x10-0x20($Xip),$Hkey   # $Hkey^2
        neg             %r12
          vaesenc       $T2,$inout3,$inout3
         vpxor          $Z1,$Z2,$Z2
        vpclmulqdq      \$0x00,$Hkey,$Ii,$Z1
         vpxor          $Z0,$Xi,$Xi             # modulo-scheduled
          vaesenc       $T2,$inout4,$inout4
         vpxor          $Z1,$T1,$Z0
        and             \$0x60,%r12
          vmovups       0x20-0x80($key),$rndkey
        vpclmulqdq      \$0x10,$Hkey,$Ii,$T1
          vaesenc       $T2,$inout5,$inout5

        vpclmulqdq      \$0x01,$Hkey,$Ii,$T2
        lea             ($in0,%r12),$in0
          vaesenc       $rndkey,$inout0,$inout0
         vpxor          16+8(%rsp),$Xi,$Xi      # modulo-scheduled [vpxor $Z3,$Xi,$Xi]
        vpclmulqdq      \$0x11,$Hkey,$Ii,$Hkey
         vmovdqu        0x40+8(%rsp),$Ii        # I[3]
          vaesenc       $rndkey,$inout1,$inout1
        movbe           0x58($in0),%r13         # stage next input block, byte-swapped, for GHASH
          vaesenc       $rndkey,$inout2,$inout2
        movbe           0x50($in0),%r12
          vaesenc       $rndkey,$inout3,$inout3
        mov             %r13,0x20+8(%rsp)
          vaesenc       $rndkey,$inout4,$inout4
        mov             %r12,0x28+8(%rsp)
        vmovdqu         0x30-0x20($Xip),$Z1     # borrow $Z1 for $Hkey^3
          vaesenc       $rndkey,$inout5,$inout5

          vmovups       0x30-0x80($key),$rndkey
         vpxor          $T1,$Z2,$Z2
        vpclmulqdq      \$0x00,$Z1,$Ii,$T1
          vaesenc       $rndkey,$inout0,$inout0
         vpxor          $T2,$Z2,$Z2
        vpclmulqdq      \$0x10,$Z1,$Ii,$T2
          vaesenc       $rndkey,$inout1,$inout1
         vpxor          $Hkey,$Z3,$Z3
        vpclmulqdq      \$0x01,$Z1,$Ii,$Hkey
          vaesenc       $rndkey,$inout2,$inout2
        vpclmulqdq      \$0x11,$Z1,$Ii,$Z1
         vmovdqu        0x50+8(%rsp),$Ii        # I[2]
          vaesenc       $rndkey,$inout3,$inout3
          vaesenc       $rndkey,$inout4,$inout4
         vpxor          $T1,$Z0,$Z0
        vmovdqu         0x40-0x20($Xip),$T1     # borrow $T1 for $Hkey^4
          vaesenc       $rndkey,$inout5,$inout5

          vmovups       0x40-0x80($key),$rndkey
         vpxor          $T2,$Z2,$Z2
        vpclmulqdq      \$0x00,$T1,$Ii,$T2
          vaesenc       $rndkey,$inout0,$inout0
         vpxor          $Hkey,$Z2,$Z2
        vpclmulqdq      \$0x10,$T1,$Ii,$Hkey
          vaesenc       $rndkey,$inout1,$inout1
        movbe           0x48($in0),%r13
         vpxor          $Z1,$Z3,$Z3
        vpclmulqdq      \$0x01,$T1,$Ii,$Z1
          vaesenc       $rndkey,$inout2,$inout2
        movbe           0x40($in0),%r12
        vpclmulqdq      \$0x11,$T1,$Ii,$T1
         vmovdqu        0x60+8(%rsp),$Ii        # I[1]
          vaesenc       $rndkey,$inout3,$inout3
        mov             %r13,0x30+8(%rsp)
          vaesenc       $rndkey,$inout4,$inout4
        mov             %r12,0x38+8(%rsp)
         vpxor          $T2,$Z0,$Z0
        vmovdqu         0x60-0x20($Xip),$T2     # borrow $T2 for $Hkey^5
          vaesenc       $rndkey,$inout5,$inout5

          vmovups       0x50-0x80($key),$rndkey
         vpxor          $Hkey,$Z2,$Z2
        vpclmulqdq      \$0x00,$T2,$Ii,$Hkey
          vaesenc       $rndkey,$inout0,$inout0
         vpxor          $Z1,$Z2,$Z2
        vpclmulqdq      \$0x10,$T2,$Ii,$Z1
          vaesenc       $rndkey,$inout1,$inout1
        movbe           0x38($in0),%r13
         vpxor          $T1,$Z3,$Z3
        vpclmulqdq      \$0x01,$T2,$Ii,$T1
         vpxor          0x70+8(%rsp),$Xi,$Xi    # accumulate I[0]
          vaesenc       $rndkey,$inout2,$inout2
        movbe           0x30($in0),%r12
        vpclmulqdq      \$0x11,$T2,$Ii,$T2
          vaesenc       $rndkey,$inout3,$inout3
        mov             %r13,0x40+8(%rsp)
          vaesenc       $rndkey,$inout4,$inout4
        mov             %r12,0x48+8(%rsp)
         vpxor          $Hkey,$Z0,$Z0
         vmovdqu        0x70-0x20($Xip),$Hkey   # $Hkey^6
          vaesenc       $rndkey,$inout5,$inout5

          vmovups       0x60-0x80($key),$rndkey
         vpxor          $Z1,$Z2,$Z2
        vpclmulqdq      \$0x10,$Hkey,$Xi,$Z1
          vaesenc       $rndkey,$inout0,$inout0
         vpxor          $T1,$Z2,$Z2
        vpclmulqdq      \$0x01,$Hkey,$Xi,$T1
          vaesenc       $rndkey,$inout1,$inout1
        movbe           0x28($in0),%r13
         vpxor          $T2,$Z3,$Z3
        vpclmulqdq      \$0x00,$Hkey,$Xi,$T2
          vaesenc       $rndkey,$inout2,$inout2
        movbe           0x20($in0),%r12
        vpclmulqdq      \$0x11,$Hkey,$Xi,$Xi
          vaesenc       $rndkey,$inout3,$inout3
        mov             %r13,0x50+8(%rsp)
          vaesenc       $rndkey,$inout4,$inout4
        mov             %r12,0x58+8(%rsp)
        vpxor           $Z1,$Z2,$Z2
          vaesenc       $rndkey,$inout5,$inout5
        vpxor           $T1,$Z2,$Z2

          vmovups       0x70-0x80($key),$rndkey
        vpslldq         \$8,$Z2,$Z1
        vpxor           $T2,$Z0,$Z0
        vmovdqu         0x10($const),$Hkey      # .Lpoly

          vaesenc       $rndkey,$inout0,$inout0
        vpxor           $Xi,$Z3,$Z3
          vaesenc       $rndkey,$inout1,$inout1
        vpxor           $Z1,$Z0,$Z0
        movbe           0x18($in0),%r13
          vaesenc       $rndkey,$inout2,$inout2
        movbe           0x10($in0),%r12
        vpalignr        \$8,$Z0,$Z0,$Ii         # 1st phase
        vpclmulqdq      \$0x10,$Hkey,$Z0,$Z0
        mov             %r13,0x60+8(%rsp)
          vaesenc       $rndkey,$inout3,$inout3
        mov             %r12,0x68+8(%rsp)
          vaesenc       $rndkey,$inout4,$inout4
          vmovups       0x80-0x80($key),$T1     # borrow $T1 for $rndkey
          vaesenc       $rndkey,$inout5,$inout5

          vaesenc       $T1,$inout0,$inout0
          vmovups       0x90-0x80($key),$rndkey
          vaesenc       $T1,$inout1,$inout1
        vpsrldq         \$8,$Z2,$Z2
          vaesenc       $T1,$inout2,$inout2
        vpxor           $Z2,$Z3,$Z3
          vaesenc       $T1,$inout3,$inout3
        vpxor           $Ii,$Z0,$Z0
        movbe           0x08($in0),%r13
          vaesenc       $T1,$inout4,$inout4
        movbe           0x00($in0),%r12
          vaesenc       $T1,$inout5,$inout5
          vmovups       0xa0-0x80($key),$T1
          cmp           \$11,$rounds
          jb            .Lenc_tail              # 128-bit key

          vaesenc       $rndkey,$inout0,$inout0
          vaesenc       $rndkey,$inout1,$inout1
          vaesenc       $rndkey,$inout2,$inout2
          vaesenc       $rndkey,$inout3,$inout3
          vaesenc       $rndkey,$inout4,$inout4
          vaesenc       $rndkey,$inout5,$inout5

          vaesenc       $T1,$inout0,$inout0
          vaesenc       $T1,$inout1,$inout1
          vaesenc       $T1,$inout2,$inout2
          vaesenc       $T1,$inout3,$inout3
          vaesenc       $T1,$inout4,$inout4
          vmovups       0xb0-0x80($key),$rndkey
          vaesenc       $T1,$inout5,$inout5
          vmovups       0xc0-0x80($key),$T1
          je            .Lenc_tail              # 192-bit key

          vaesenc       $rndkey,$inout0,$inout0
          vaesenc       $rndkey,$inout1,$inout1
          vaesenc       $rndkey,$inout2,$inout2
          vaesenc       $rndkey,$inout3,$inout3
          vaesenc       $rndkey,$inout4,$inout4
          vaesenc       $rndkey,$inout5,$inout5

          vaesenc       $T1,$inout0,$inout0
          vaesenc       $T1,$inout1,$inout1
          vaesenc       $T1,$inout2,$inout2
          vaesenc       $T1,$inout3,$inout3
          vaesenc       $T1,$inout4,$inout4
          vmovups       0xd0-0x80($key),$rndkey
          vaesenc       $T1,$inout5,$inout5
          vmovups       0xe0-0x80($key),$T1
          jmp           .Lenc_tail              # 256-bit key

.align  32
.Lhandle_ctr32:
        # Counter-wrap path: byte-swap the counter to little-endian,
        # redo the six lane increments with 32-bit vpaddd (which carries
        # correctly across bytes), then swap each lane back.
        vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
          vpshufb       $Ii,$T1,$Z2             # byte-swap counter
          vmovdqu       0x30($const),$Z1        # borrow $Z1, .Ltwo_lsb
          vpaddd        0x40($const),$Z2,$inout1        # .Lone_lsb
          vpaddd        $Z1,$Z2,$inout2
        vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
          vpaddd        $Z1,$inout1,$inout3
          vpshufb       $Ii,$inout1,$inout1
          vpaddd        $Z1,$inout2,$inout4
          vpshufb       $Ii,$inout2,$inout2
          vpxor         $rndkey,$inout1,$inout1
          vpaddd        $Z1,$inout3,$inout5
          vpshufb       $Ii,$inout3,$inout3
          vpxor         $rndkey,$inout2,$inout2
          vpaddd        $Z1,$inout4,$T1         # byte-swapped next counter value
          vpshufb       $Ii,$inout4,$inout4
          vpshufb       $Ii,$inout5,$inout5
          vpshufb       $Ii,$T1,$T1             # next counter value
        jmp             .Lresume_ctr32

.align  32
.Lenc_tail:
        # Last AES round: XOR each input block with the final round key
        # up front, then let vaesenclast merge the final AddRoundKey
        # with the payload XOR in a single instruction.
          vaesenc       $rndkey,$inout0,$inout0
        vmovdqu         $Z3,16+8(%rsp)          # postpone vpxor $Z3,$Xi,$Xi
        vpalignr        \$8,$Z0,$Z0,$Xi         # 2nd phase
          vaesenc       $rndkey,$inout1,$inout1
        vpclmulqdq      \$0x10,$Hkey,$Z0,$Z0
          vpxor         0x00($inp),$T1,$T2
          vaesenc       $rndkey,$inout2,$inout2
          vpxor         0x10($inp),$T1,$Ii
          vaesenc       $rndkey,$inout3,$inout3
          vpxor         0x20($inp),$T1,$Z1
          vaesenc       $rndkey,$inout4,$inout4
          vpxor         0x30($inp),$T1,$Z2
          vaesenc       $rndkey,$inout5,$inout5
          vpxor         0x40($inp),$T1,$Z3
          vpxor         0x50($inp),$T1,$Hkey
          vmovdqu       ($ivp),$T1              # load next counter value

          vaesenclast   $T2,$inout0,$inout0
          vmovdqu       0x20($const),$T2        # borrow $T2, .Lone_msb
          vaesenclast   $Ii,$inout1,$inout1
         vpaddb         $T2,$T1,$Ii
        mov             %r13,0x70+8(%rsp)
        lea             0x60($inp),$inp
          vaesenclast   $Z1,$inout2,$inout2
         vpaddb         $T2,$Ii,$Z1
        mov             %r12,0x78+8(%rsp)
        lea             0x60($out),$out
          vmovdqu       0x00-0x80($key),$rndkey
          vaesenclast   $Z2,$inout3,$inout3
         vpaddb         $T2,$Z1,$Z2
          vaesenclast   $Z3, $inout4,$inout4
         vpaddb         $T2,$Z2,$Z3
          vaesenclast   $Hkey,$inout5,$inout5
         vpaddb         $T2,$Z3,$Hkey

        add             \$0x60,$ret
        sub             \$0x6,$len
        jc              .L6x_done

          vmovups       $inout0,-0x60($out)     # save output
         vpxor          $rndkey,$T1,$inout0
          vmovups       $inout1,-0x50($out)
         vmovdqa        $Ii,$inout1             # 0 latency
          vmovups       $inout2,-0x40($out)
         vmovdqa        $Z1,$inout2             # 0 latency
          vmovups       $inout3,-0x30($out)
         vmovdqa        $Z2,$inout3             # 0 latency
          vmovups       $inout4,-0x20($out)
         vmovdqa        $Z3,$inout4             # 0 latency
          vmovups       $inout5,-0x10($out)
         vmovdqa        $Hkey,$inout5           # 0 latency
        vmovdqu         0x20+8(%rsp),$Z3        # I[5]
        jmp             .Loop6x

.L6x_done:
        # Fold the two deferred XORs into $Xi before returning.
        vpxor           16+8(%rsp),$Xi,$Xi      # modulo-scheduled
        vpxor           $Z0,$Xi,$Xi             # modulo-scheduled

        ret
.size   _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
___
404 ######################################################################
405 #
406 # size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
407 #               const AES_KEY *key, unsigned char iv[16],
408 #               struct { u128 Xi,H,Htbl[9]; } *Xip);
$code.=<<___;
######################################################################
# size_t aesni_gcm_decrypt(const void *inp, void *out, size_t len,
#               const AES_KEY *key, unsigned char iv[16],
#               struct { u128 Xi,H,Htbl[9]; } *Xip);
#
# Returns the number of bytes processed (a multiple of 0x60, 0 if
# len < 0x60); Xi and iv are updated in place.  In decrypt direction
# the ciphertext is the GHASH input, so the first six blocks can be
# staged for hashing immediately and everything runs through the
# stitched loop.
.globl  aesni_gcm_decrypt
.type   aesni_gcm_decrypt,\@function,6
.align  32
aesni_gcm_decrypt:
.cfi_startproc
        xor     $ret,$ret
        cmp     \$0x60,$len                     # minimal accepted length
        jb      .Lgcm_dec_abort

        lea     (%rsp),%rax                     # save stack pointer
.cfi_def_cfa_register   %rax
        push    %rbx
.cfi_push       %rbx
        push    %rbp
.cfi_push       %rbp
        push    %r12
.cfi_push       %r12
        push    %r13
.cfi_push       %r13
        push    %r14
.cfi_push       %r14
        push    %r15
.cfi_push       %r15
___
$code.=<<___ if ($win64);
        # Win64 ABI: xmm6-xmm15 are non-volatile and must be preserved.
        lea     -0xa8(%rsp),%rsp
        movaps  %xmm6,-0xd8(%rax)
        movaps  %xmm7,-0xc8(%rax)
        movaps  %xmm8,-0xb8(%rax)
        movaps  %xmm9,-0xa8(%rax)
        movaps  %xmm10,-0x98(%rax)
        movaps  %xmm11,-0x88(%rax)
        movaps  %xmm12,-0x78(%rax)
        movaps  %xmm13,-0x68(%rax)
        movaps  %xmm14,-0x58(%rax)
        movaps  %xmm15,-0x48(%rax)
.Lgcm_dec_body:
___
$code.=<<___;
        vzeroupper

        vmovdqu         ($ivp),$T1              # input counter value
        add             \$-128,%rsp
        mov             12($ivp),$counter
        lea             .Lbswap_mask(%rip),$const
        lea             -0x80($key),$in0        # borrow $in0
        mov             \$0xf80,$end0           # borrow $end0
        vmovdqu         ($Xip),$Xi              # load Xi
        and             \$-128,%rsp             # ensure stack alignment
        vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
        lea             0x80($key),$key         # size optimization
        lea             0x20+0x20($Xip),$Xip    # size optimization
        mov             0xf0-0x80($key),$rounds
        vpshufb         $Ii,$Xi,$Xi

        # If the key schedule lies less than 768 bytes above the new
        # stack top (same 4K page neighbourhood), lower %rsp further so
        # the scratch area cannot alias it.
        and             $end0,$in0
        and             %rsp,$end0
        sub             $in0,$end0
        jc              .Ldec_no_key_aliasing
        cmp             \$768,$end0
        jnc             .Ldec_no_key_aliasing
        sub             $end0,%rsp              # avoid aliasing with key
.Ldec_no_key_aliasing:

        # Pre-load the first six ciphertext blocks and stage their
        # byte-swapped copies on the stack for _aesni_ctr32_ghash_6x.
        vmovdqu         0x50($inp),$Z3          # I[5]
        lea             ($inp),$in0
        vmovdqu         0x40($inp),$Z0
        lea             -0xc0($inp,$len),$end0
        vmovdqu         0x30($inp),$Z1
        shr             \$4,$len
        xor             $ret,$ret
        vmovdqu         0x20($inp),$Z2
         vpshufb        $Ii,$Z3,$Z3             # passed to _aesni_ctr32_ghash_6x
        vmovdqu         0x10($inp),$T2
         vpshufb        $Ii,$Z0,$Z0
        vmovdqu         ($inp),$Hkey
         vpshufb        $Ii,$Z1,$Z1
        vmovdqu         $Z0,0x30(%rsp)
         vpshufb        $Ii,$Z2,$Z2
        vmovdqu         $Z1,0x40(%rsp)
         vpshufb        $Ii,$T2,$T2
        vmovdqu         $Z2,0x50(%rsp)
         vpshufb        $Ii,$Hkey,$Hkey
        vmovdqu         $T2,0x60(%rsp)
        vmovdqu         $Hkey,0x70(%rsp)

        call            _aesni_ctr32_ghash_6x

        vmovups         $inout0,-0x60($out)     # save output
        vmovups         $inout1,-0x50($out)
        vmovups         $inout2,-0x40($out)
        vmovups         $inout3,-0x30($out)
        vmovups         $inout4,-0x20($out)
        vmovups         $inout5,-0x10($out)

        vpshufb         ($const),$Xi,$Xi        # .Lbswap_mask
        vmovdqu         $Xi,-0x40($Xip)         # output Xi

        vzeroupper
___
$code.=<<___ if ($win64);
        movaps  -0xd8(%rax),%xmm6
        movaps  -0xc8(%rax),%xmm7
        movaps  -0xb8(%rax),%xmm8
        movaps  -0xa8(%rax),%xmm9
        movaps  -0x98(%rax),%xmm10
        movaps  -0x88(%rax),%xmm11
        movaps  -0x78(%rax),%xmm12
        movaps  -0x68(%rax),%xmm13
        movaps  -0x58(%rax),%xmm14
        movaps  -0x48(%rax),%xmm15
___
$code.=<<___;
        mov     -48(%rax),%r15
.cfi_restore    %r15
        mov     -40(%rax),%r14
.cfi_restore    %r14
        mov     -32(%rax),%r13
.cfi_restore    %r13
        mov     -24(%rax),%r12
.cfi_restore    %r12
        mov     -16(%rax),%rbp
.cfi_restore    %rbp
        mov     -8(%rax),%rbx
.cfi_restore    %rbx
        lea     (%rax),%rsp             # restore %rsp
.cfi_def_cfa_register   %rsp
.Lgcm_dec_abort:
        mov     $ret,%rax               # return value
        ret
.cfi_endproc
.size   aesni_gcm_decrypt,.-aesni_gcm_decrypt
___
543
$code.=<<___;
######################################################################
# _aesni_ctr32_6x
#
# Plain 6-block AES-CTR pass with no GHASH.  Used twice by
# aesni_gcm_encrypt to produce the first 12 ciphertext blocks before
# the stitched loop has ciphertext to hash.  Same internal register
# convention as _aesni_ctr32_ghash_6x ($T1 = counter block, $counter =
# counter word); consumes 0x60 bytes at $inp, writes 0x60 at $out and
# advances both.
.type   _aesni_ctr32_6x,\@abi-omnipotent
.align  32
_aesni_ctr32_6x:
        vmovdqu         0x00-0x80($key),$Z0     # borrow $Z0 for $rndkey
        vmovdqu         0x20($const),$T2        # borrow $T2, .Lone_msb
        lea             -1($rounds),%r13
        vmovups         0x10-0x80($key),$rndkey
        lea             0x20-0x80($key),%r12
        vpxor           $Z0,$T1,$inout0
        add             \$`6<<24`,$counter
        jc              .Lhandle_ctr32_2
        vpaddb          $T2,$T1,$inout1
        vpaddb          $T2,$inout1,$inout2
        vpxor           $Z0,$inout1,$inout1
        vpaddb          $T2,$inout2,$inout3
        vpxor           $Z0,$inout2,$inout2
        vpaddb          $T2,$inout3,$inout4
        vpxor           $Z0,$inout3,$inout3
        vpaddb          $T2,$inout4,$inout5
        vpxor           $Z0,$inout4,$inout4
        vpaddb          $T2,$inout5,$T1
        vpxor           $Z0,$inout5,$inout5
        jmp             .Loop_ctr32

.align  16
.Loop_ctr32:
        vaesenc         $rndkey,$inout0,$inout0
        vaesenc         $rndkey,$inout1,$inout1
        vaesenc         $rndkey,$inout2,$inout2
        vaesenc         $rndkey,$inout3,$inout3
        vaesenc         $rndkey,$inout4,$inout4
        vaesenc         $rndkey,$inout5,$inout5
        vmovups         (%r12),$rndkey
        lea             0x10(%r12),%r12
        dec             %r13d
        jnz             .Loop_ctr32

        # Last round: fold the final AddRoundKey into the payload XOR
        # by feeding (input XOR last-round-key) to vaesenclast.
        vmovdqu         (%r12),$Hkey            # last round key
        vaesenc         $rndkey,$inout0,$inout0
        vpxor           0x00($inp),$Hkey,$Z0
        vaesenc         $rndkey,$inout1,$inout1
        vpxor           0x10($inp),$Hkey,$Z1
        vaesenc         $rndkey,$inout2,$inout2
        vpxor           0x20($inp),$Hkey,$Z2
        vaesenc         $rndkey,$inout3,$inout3
        vpxor           0x30($inp),$Hkey,$Xi
        vaesenc         $rndkey,$inout4,$inout4
        vpxor           0x40($inp),$Hkey,$T2
        vaesenc         $rndkey,$inout5,$inout5
        vpxor           0x50($inp),$Hkey,$Hkey
        lea             0x60($inp),$inp

        vaesenclast     $Z0,$inout0,$inout0
        vaesenclast     $Z1,$inout1,$inout1
        vaesenclast     $Z2,$inout2,$inout2
        vaesenclast     $Xi,$inout3,$inout3
        vaesenclast     $T2,$inout4,$inout4
        vaesenclast     $Hkey,$inout5,$inout5
        vmovups         $inout0,0x00($out)
        vmovups         $inout1,0x10($out)
        vmovups         $inout2,0x20($out)
        vmovups         $inout3,0x30($out)
        vmovups         $inout4,0x40($out)
        vmovups         $inout5,0x50($out)
        lea             0x60($out),$out

        ret
.align  32
.Lhandle_ctr32_2:
        # Counter-wrap path: redo the six increments in little-endian
        # dwords with vpaddd, then byte-swap each lane back (cf.
        # .Lhandle_ctr32 in _aesni_ctr32_ghash_6x).
        vpshufb         $Ii,$T1,$Z2             # byte-swap counter
        vmovdqu         0x30($const),$Z1        # borrow $Z1, .Ltwo_lsb
        vpaddd          0x40($const),$Z2,$inout1        # .Lone_lsb
        vpaddd          $Z1,$Z2,$inout2
        vpaddd          $Z1,$inout1,$inout3
        vpshufb         $Ii,$inout1,$inout1
        vpaddd          $Z1,$inout2,$inout4
        vpshufb         $Ii,$inout2,$inout2
        vpxor           $Z0,$inout1,$inout1
        vpaddd          $Z1,$inout3,$inout5
        vpshufb         $Ii,$inout3,$inout3
        vpxor           $Z0,$inout2,$inout2
        vpaddd          $Z1,$inout4,$T1         # byte-swapped next counter value
        vpshufb         $Ii,$inout4,$inout4
        vpxor           $Z0,$inout3,$inout3
        vpshufb         $Ii,$inout5,$inout5
        vpxor           $Z0,$inout4,$inout4
        vpshufb         $Ii,$T1,$T1             # next counter value
        vpxor           $Z0,$inout5,$inout5
        jmp     .Loop_ctr32
.size   _aesni_ctr32_6x,.-_aesni_ctr32_6x

######################################################################
# size_t aesni_gcm_encrypt(const void *inp, void *out, size_t len,
#               const AES_KEY *key, unsigned char iv[16],
#               struct { u128 Xi,H,Htbl[9]; } *Xip);
#
# Returns the number of bytes processed (a multiple of 0x60, 0 if
# len < 0x60*3).  Encryption must run 12 blocks ahead of hashing --
# the ciphertext is the GHASH input and only exists once produced --
# hence the larger minimum length and the two _aesni_ctr32_6x calls
# before the stitched loop.
.globl  aesni_gcm_encrypt
.type   aesni_gcm_encrypt,\@function,6
.align  32
aesni_gcm_encrypt:
.cfi_startproc
        xor     $ret,$ret
        cmp     \$0x60*3,$len                   # minimal accepted length
        jb      .Lgcm_enc_abort

        lea     (%rsp),%rax                     # save stack pointer
.cfi_def_cfa_register   %rax
        push    %rbx
.cfi_push       %rbx
        push    %rbp
.cfi_push       %rbp
        push    %r12
.cfi_push       %r12
        push    %r13
.cfi_push       %r13
        push    %r14
.cfi_push       %r14
        push    %r15
.cfi_push       %r15
___
$code.=<<___ if ($win64);
        # Win64 ABI: xmm6-xmm15 are non-volatile and must be preserved.
        lea     -0xa8(%rsp),%rsp
        movaps  %xmm6,-0xd8(%rax)
        movaps  %xmm7,-0xc8(%rax)
        movaps  %xmm8,-0xb8(%rax)
        movaps  %xmm9,-0xa8(%rax)
        movaps  %xmm10,-0x98(%rax)
        movaps  %xmm11,-0x88(%rax)
        movaps  %xmm12,-0x78(%rax)
        movaps  %xmm13,-0x68(%rax)
        movaps  %xmm14,-0x58(%rax)
        movaps  %xmm15,-0x48(%rax)
.Lgcm_enc_body:
___
$code.=<<___;
        vzeroupper

        vmovdqu         ($ivp),$T1              # input counter value
        add             \$-128,%rsp
        mov             12($ivp),$counter
        lea             .Lbswap_mask(%rip),$const
        lea             -0x80($key),$in0        # borrow $in0
        mov             \$0xf80,$end0           # borrow $end0
        lea             0x80($key),$key         # size optimization
        vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
        and             \$-128,%rsp             # ensure stack alignment
        mov             0xf0-0x80($key),$rounds

        # Same key/stack anti-aliasing dance as in aesni_gcm_decrypt.
        and             $end0,$in0
        and             %rsp,$end0
        sub             $in0,$end0
        jc              .Lenc_no_key_aliasing
        cmp             \$768,$end0
        jnc             .Lenc_no_key_aliasing
        sub             $end0,%rsp              # avoid aliasing with key
.Lenc_no_key_aliasing:

        # In encrypt direction the GHASH input is our own output, so
        # $in0/$end0 track the output buffer.
        lea             ($out),$in0
        lea             -0xc0($out,$len),$end0
        shr             \$4,$len

        # First 6 blocks: plain CTR; stage their bswapped ciphertext on
        # the stack for the stitched loop.
        call            _aesni_ctr32_6x
        vpshufb         $Ii,$inout0,$Xi         # save bswapped output on stack
        vpshufb         $Ii,$inout1,$T2
        vmovdqu         $Xi,0x70(%rsp)
        vpshufb         $Ii,$inout2,$Z0
        vmovdqu         $T2,0x60(%rsp)
        vpshufb         $Ii,$inout3,$Z1
        vmovdqu         $Z0,0x50(%rsp)
        vpshufb         $Ii,$inout4,$Z2
        vmovdqu         $Z1,0x40(%rsp)
        vpshufb         $Ii,$inout5,$Z3         # passed to _aesni_ctr32_ghash_6x
        vmovdqu         $Z2,0x30(%rsp)

        # Second 6 blocks, then enter the stitched loop 12 blocks ahead.
        call            _aesni_ctr32_6x

        vmovdqu         ($Xip),$Xi              # load Xi
        lea             0x20+0x20($Xip),$Xip    # size optimization
        sub             \$12,$len
        mov             \$0x60*2,$ret
        vpshufb         $Ii,$Xi,$Xi

        call            _aesni_ctr32_ghash_6x
        # Tail: store the last 6 ciphertext blocks and keep bswapped
        # copies around -- they still have to be folded into the hash.
        vmovdqu         0x20(%rsp),$Z3          # I[5]
         vmovdqu        ($const),$Ii            # borrow $Ii for .Lbswap_mask
        vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
        vpunpckhqdq     $Z3,$Z3,$T1
        vmovdqu         0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK
         vmovups        $inout0,-0x60($out)     # save output
         vpshufb        $Ii,$inout0,$inout0     # but keep bswapped copy
        vpxor           $Z3,$T1,$T1
         vmovups        $inout1,-0x50($out)
         vpshufb        $Ii,$inout1,$inout1
         vmovups        $inout2,-0x40($out)
         vpshufb        $Ii,$inout2,$inout2
         vmovups        $inout3,-0x30($out)
         vpshufb        $Ii,$inout3,$inout3
         vmovups        $inout4,-0x20($out)
         vpshufb        $Ii,$inout4,$inout4
         vmovups        $inout5,-0x10($out)
         vpshufb        $Ii,$inout5,$inout5
         vmovdqu        $inout0,0x10(%rsp)      # free $inout0
___
# Lexical scope for the tail GHASH pass of aesni_gcm_encrypt.  $rndkey and
# $inout0 are re-labelled $HK/$T3 here: $rndkey is free to carry the
# pre-computed Karatsuba (H^i.lo xor H^i.hi) halves loaded from the Htable,
# and $inout0's bswapped copy was already parked at 0x10(%rsp) above.
743 { my ($HK,$T3)=($rndkey,$inout0);
744 
# Hash the final 12 ciphertext blocks (six on the stack, six still in
# $inout0..$inout5) against H^1..H^6 and fold the result into $Xi, including
# both phases of the reduction modulo the GHASH polynomial (.Lpoly).  As in
# the rest of this module, the extra-indented instructions belong to the
# interleaved second/third instruction stream, not to a different scope.
745 $code.=<<___;
746          vmovdqu        0x30(%rsp),$Z2          # I[4]
747          vmovdqu        0x10-0x20($Xip),$Ii     # borrow $Ii for $Hkey^2
748          vpunpckhqdq    $Z2,$Z2,$T2
749         vpclmulqdq      \$0x00,$Hkey,$Z3,$Z1
750          vpxor          $Z2,$T2,$T2
751         vpclmulqdq      \$0x11,$Hkey,$Z3,$Z3
752         vpclmulqdq      \$0x00,$HK,$T1,$T1
753 
754          vmovdqu        0x40(%rsp),$T3          # I[3]
755         vpclmulqdq      \$0x00,$Ii,$Z2,$Z0
756          vmovdqu        0x30-0x20($Xip),$Hkey   # $Hkey^3
757         vpxor           $Z1,$Z0,$Z0
758          vpunpckhqdq    $T3,$T3,$Z1
759         vpclmulqdq      \$0x11,$Ii,$Z2,$Z2
760          vpxor          $T3,$Z1,$Z1
761         vpxor           $Z3,$Z2,$Z2
762         vpclmulqdq      \$0x10,$HK,$T2,$T2
763          vmovdqu        0x50-0x20($Xip),$HK
764         vpxor           $T1,$T2,$T2
765 
766          vmovdqu        0x50(%rsp),$T1          # I[2]
767         vpclmulqdq      \$0x00,$Hkey,$T3,$Z3
768          vmovdqu        0x40-0x20($Xip),$Ii     # borrow $Ii for $Hkey^4
769         vpxor           $Z0,$Z3,$Z3
770          vpunpckhqdq    $T1,$T1,$Z0
771         vpclmulqdq      \$0x11,$Hkey,$T3,$T3
772          vpxor          $T1,$Z0,$Z0
773         vpxor           $Z2,$T3,$T3
774         vpclmulqdq      \$0x00,$HK,$Z1,$Z1
775         vpxor           $T2,$Z1,$Z1
776 
777          vmovdqu        0x60(%rsp),$T2          # I[1]
778         vpclmulqdq      \$0x00,$Ii,$T1,$Z2
779          vmovdqu        0x60-0x20($Xip),$Hkey   # $Hkey^5
780         vpxor           $Z3,$Z2,$Z2
781          vpunpckhqdq    $T2,$T2,$Z3
782         vpclmulqdq      \$0x11,$Ii,$T1,$T1
783          vpxor          $T2,$Z3,$Z3
784         vpxor           $T3,$T1,$T1
785         vpclmulqdq      \$0x10,$HK,$Z0,$Z0
786          vmovdqu        0x80-0x20($Xip),$HK
787         vpxor           $Z1,$Z0,$Z0
788 
789          vpxor          0x70(%rsp),$Xi,$Xi      # accumulate I[0]
790         vpclmulqdq      \$0x00,$Hkey,$T2,$Z1
791          vmovdqu        0x70-0x20($Xip),$Ii     # borrow $Ii for $Hkey^6
792          vpunpckhqdq    $Xi,$Xi,$T3
793         vpxor           $Z2,$Z1,$Z1
794         vpclmulqdq      \$0x11,$Hkey,$T2,$T2
795          vpxor          $Xi,$T3,$T3
796         vpxor           $T1,$T2,$T2
797         vpclmulqdq      \$0x00,$HK,$Z3,$Z3
798         vpxor           $Z0,$Z3,$Z0
799 
800         vpclmulqdq      \$0x00,$Ii,$Xi,$Z2
801          vmovdqu        0x00-0x20($Xip),$Hkey   # $Hkey^1
802          vpunpckhqdq    $inout5,$inout5,$T1
803         vpclmulqdq      \$0x11,$Ii,$Xi,$Xi
804          vpxor          $inout5,$T1,$T1
805         vpxor           $Z1,$Z2,$Z1
806         vpclmulqdq      \$0x10,$HK,$T3,$T3
807          vmovdqu        0x20-0x20($Xip),$HK
808         vpxor           $T2,$Xi,$Z3
809         vpxor           $Z0,$T3,$Z2
810 
811          vmovdqu        0x10-0x20($Xip),$Ii     # borrow $Ii for $Hkey^2
812           vpxor         $Z1,$Z3,$T3             # aggregated Karatsuba post-processing
813         vpclmulqdq      \$0x00,$Hkey,$inout5,$Z0
814           vpxor         $T3,$Z2,$Z2
815          vpunpckhqdq    $inout4,$inout4,$T2
816         vpclmulqdq      \$0x11,$Hkey,$inout5,$inout5
817          vpxor          $inout4,$T2,$T2
818           vpslldq       \$8,$Z2,$T3
819         vpclmulqdq      \$0x00,$HK,$T1,$T1
820           vpxor         $T3,$Z1,$Xi
821           vpsrldq       \$8,$Z2,$Z2
822           vpxor         $Z2,$Z3,$Z3
823 
824         vpclmulqdq      \$0x00,$Ii,$inout4,$Z1
825          vmovdqu        0x30-0x20($Xip),$Hkey   # $Hkey^3
826         vpxor           $Z0,$Z1,$Z1
827          vpunpckhqdq    $inout3,$inout3,$T3
828         vpclmulqdq      \$0x11,$Ii,$inout4,$inout4
829          vpxor          $inout3,$T3,$T3
830         vpxor           $inout5,$inout4,$inout4
831           vpalignr      \$8,$Xi,$Xi,$inout5     # 1st phase
832         vpclmulqdq      \$0x10,$HK,$T2,$T2
833          vmovdqu        0x50-0x20($Xip),$HK
834         vpxor           $T1,$T2,$T2
835 
836         vpclmulqdq      \$0x00,$Hkey,$inout3,$Z0
837          vmovdqu        0x40-0x20($Xip),$Ii     # borrow $Ii for $Hkey^4
838         vpxor           $Z1,$Z0,$Z0
839          vpunpckhqdq    $inout2,$inout2,$T1
840         vpclmulqdq      \$0x11,$Hkey,$inout3,$inout3
841          vpxor          $inout2,$T1,$T1
842         vpxor           $inout4,$inout3,$inout3
843           vxorps        0x10(%rsp),$Z3,$Z3      # accumulate $inout0
844         vpclmulqdq      \$0x00,$HK,$T3,$T3
845         vpxor           $T2,$T3,$T3
846 
847           vpclmulqdq    \$0x10,0x10($const),$Xi,$Xi
848           vxorps        $inout5,$Xi,$Xi
849 
850         vpclmulqdq      \$0x00,$Ii,$inout2,$Z1
851          vmovdqu        0x60-0x20($Xip),$Hkey   # $Hkey^5
852         vpxor           $Z0,$Z1,$Z1
853          vpunpckhqdq    $inout1,$inout1,$T2
854         vpclmulqdq      \$0x11,$Ii,$inout2,$inout2
855          vpxor          $inout1,$T2,$T2
856           vpalignr      \$8,$Xi,$Xi,$inout5     # 2nd phase
857         vpxor           $inout3,$inout2,$inout2
858         vpclmulqdq      \$0x10,$HK,$T1,$T1
859          vmovdqu        0x80-0x20($Xip),$HK
860         vpxor           $T3,$T1,$T1
861 
862           vxorps        $Z3,$inout5,$inout5
863           vpclmulqdq    \$0x10,0x10($const),$Xi,$Xi
864           vxorps        $inout5,$Xi,$Xi
865 
866         vpclmulqdq      \$0x00,$Hkey,$inout1,$Z0
867          vmovdqu        0x70-0x20($Xip),$Ii     # borrow $Ii for $Hkey^6
868         vpxor           $Z1,$Z0,$Z0
869          vpunpckhqdq    $Xi,$Xi,$T3
870         vpclmulqdq      \$0x11,$Hkey,$inout1,$inout1
871          vpxor          $Xi,$T3,$T3
872         vpxor           $inout2,$inout1,$inout1
873         vpclmulqdq      \$0x00,$HK,$T2,$T2
874         vpxor           $T1,$T2,$T2
875 
876         vpclmulqdq      \$0x00,$Ii,$Xi,$Z1
877         vpclmulqdq      \$0x11,$Ii,$Xi,$Z3
878         vpxor           $Z0,$Z1,$Z1
879         vpclmulqdq      \$0x10,$HK,$T3,$Z2
880         vpxor           $inout1,$Z3,$Z3
881         vpxor           $T2,$Z2,$Z2
882 
883         vpxor           $Z1,$Z3,$Z0             # aggregated Karatsuba post-processing
884         vpxor           $Z0,$Z2,$Z2
885         vpslldq         \$8,$Z2,$T1
886         vmovdqu         0x10($const),$Hkey      # .Lpoly
887         vpsrldq         \$8,$Z2,$Z2
888         vpxor           $T1,$Z1,$Xi
889         vpxor           $Z2,$Z3,$Z3
890 
891         vpalignr        \$8,$Xi,$Xi,$T2         # 1st phase
892         vpclmulqdq      \$0x10,$Hkey,$Xi,$Xi
893         vpxor           $T2,$Xi,$Xi
894 
895         vpalignr        \$8,$Xi,$Xi,$T2         # 2nd phase
896         vpclmulqdq      \$0x10,$Hkey,$Xi,$Xi
897         vpxor           $Z3,$T2,$T2
898         vpxor           $T2,$Xi,$Xi
899 ___
# End of $HK/$T3 aliasing scope; $rndkey/$inout0 regain their usual roles.
900 }
# Epilogue of aesni_gcm_encrypt: byte-swap the reduced hash back to GHASH's
# bit order, store it at Xi's slot (-0x40 relative to the advanced $Xip),
# and clear the upper YMM state before returning to SSE/C code.
901 $code.=<<___;
902         vpshufb         ($const),$Xi,$Xi        # .Lbswap_mask
903         vmovdqu         $Xi,-0x40($Xip)         # output Xi
904 
905         vzeroupper
906 ___
# Win64 only: xmm6-xmm15 are callee-saved under the Microsoft x64 ABI, so
# reload them from the save area laid down in the prologue (below %rax,
# which still points at the original stack frame).
907 $code.=<<___ if ($win64);
908         movaps  -0xd8(%rax),%xmm6
909         movaps  -0xc8(%rax),%xmm7
910         movaps  -0xb8(%rax),%xmm8
911         movaps  -0xa8(%rax),%xmm9
912         movaps  -0x98(%rax),%xmm10
913         movaps  -0x88(%rax),%xmm11
914         movaps  -0x78(%rax),%xmm12
915         movaps  -0x68(%rax),%xmm13
916         movaps  -0x58(%rax),%xmm14
917         movaps  -0x48(%rax),%xmm15
918 ___
# Restore callee-saved GPRs (with matching CFI annotations for unwinders),
# rewind %rsp to the frame base and return the byte count in %rax.
919 $code.=<<___;
920         mov     -48(%rax),%r15
921 .cfi_restore    %r15
922         mov     -40(%rax),%r14
923 .cfi_restore    %r14
924         mov     -32(%rax),%r13
925 .cfi_restore    %r13
926         mov     -24(%rax),%r12
927 .cfi_restore    %r12
928         mov     -16(%rax),%rbp
929 .cfi_restore    %rbp
930         mov     -8(%rax),%rbx
931 .cfi_restore    %rbx
932         lea     (%rax),%rsp             # restore %rsp
933 .cfi_def_cfa_register   %rsp
934 .Lgcm_enc_abort:
935         mov     $ret,%rax               # return value
936         ret
937 .cfi_endproc
938 .size   aesni_gcm_encrypt,.-aesni_gcm_encrypt
939 ___
940
# Read-only constant pool shared by both entry points: the byte-swap shuffle
# mask, the GHASH reduction polynomial (0xc2 in the top byte), and the
# big-/little-endian counter increments used by the CTR loops.
941 $code.=<<___;
942 .align  64
943 .Lbswap_mask:
944         .byte   15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
945 .Lpoly:
946         .byte   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
947 .Lone_msb:
948         .byte   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
949 .Ltwo_lsb:
950         .byte   2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
951 .Lone_lsb:
952         .byte   1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
953 .asciz  "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
954 .align  64
955 ___
# Win64 structured-exception-handling support: a hand-written unwind handler
# plus the .pdata/.xdata records that register it for both entry points.
# RtlVirtualUnwind argument registers per the Microsoft x64 convention.
956 if ($win64) {
957 $rec="%rcx";
958 $frame="%rdx";
959 $context="%r8";
960 $disp="%r9";
961 
# If the fault hit between the prologue and epilogue labels (HandlerData[0]
# and [1]), recover the saved GPRs and xmm6-15 from the frame anchored at
# context->Rax before handing control back to RtlVirtualUnwind.
962 $code.=<<___
963 .extern __imp_RtlVirtualUnwind
964 .type   gcm_se_handler,\@abi-omnipotent
965 .align  16
966 gcm_se_handler:
967         push    %rsi
968         push    %rdi
969         push    %rbx
970         push    %rbp
971         push    %r12
972         push    %r13
973         push    %r14
974         push    %r15
975         pushfq
976         sub     \$64,%rsp
977 
978         mov     120($context),%rax      # pull context->Rax
979         mov     248($context),%rbx      # pull context->Rip
980 
981         mov     8($disp),%rsi           # disp->ImageBase
982         mov     56($disp),%r11          # disp->HandlerData
983 
984         mov     0(%r11),%r10d           # HandlerData[0]
985         lea     (%rsi,%r10),%r10        # prologue label
986         cmp     %r10,%rbx               # context->Rip<prologue label
987         jb      .Lcommon_seh_tail
988 
989         mov     152($context),%rax      # pull context->Rsp
990 
991         mov     4(%r11),%r10d           # HandlerData[1]
992         lea     (%rsi,%r10),%r10        # epilogue label
993         cmp     %r10,%rbx               # context->Rip>=epilogue label
994         jae     .Lcommon_seh_tail
995 
996         mov     120($context),%rax      # pull context->Rax
997 
998         mov     -48(%rax),%r15
999         mov     -40(%rax),%r14
1000         mov     -32(%rax),%r13
1001         mov     -24(%rax),%r12
1002         mov     -16(%rax),%rbp
1003         mov     -8(%rax),%rbx
1004         mov     %r15,240($context)
1005         mov     %r14,232($context)
1006         mov     %r13,224($context)
1007         mov     %r12,216($context)
1008         mov     %rbp,160($context)
1009         mov     %rbx,144($context)
1010 
1011         lea     -0xd8(%rax),%rsi        # %xmm save area
1012         lea     512($context),%rdi      # & context.Xmm6
1013         mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
1014         .long   0xa548f3fc              # cld; rep movsq
1015 
1016 .Lcommon_seh_tail:
1017         mov     8(%rax),%rdi
1018         mov     16(%rax),%rsi
1019         mov     %rax,152($context)      # restore context->Rsp
1020         mov     %rsi,168($context)      # restore context->Rsi
1021         mov     %rdi,176($context)      # restore context->Rdi
1022 
1023         mov     40($disp),%rdi          # disp->ContextRecord
1024         mov     $context,%rsi           # context
1025         mov     \$154,%ecx              # sizeof(CONTEXT)
1026         .long   0xa548f3fc              # cld; rep movsq
1027 
1028         mov     $disp,%rsi
1029         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1030         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1031         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1032         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1033         mov     40(%rsi),%r10           # disp->ContextRecord
1034         lea     56(%rsi),%r11           # &disp->HandlerData
1035         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1036         mov     %r10,32(%rsp)           # arg5
1037         mov     %r11,40(%rsp)           # arg6
1038         mov     %r12,48(%rsp)           # arg7
1039         mov     %rcx,56(%rsp)           # arg8, (NULL)
1040         call    *__imp_RtlVirtualUnwind(%rip)
1041 
1042         mov     \$1,%eax                # ExceptionContinueSearch
1043         add     \$64,%rsp
1044         popfq
1045         pop     %r15
1046         pop     %r14
1047         pop     %r13
1048         pop     %r12
1049         pop     %rbp
1050         pop     %rbx
1051         pop     %rdi
1052         pop     %rsi
1053         ret
1054 .size   gcm_se_handler,.-gcm_se_handler
1055 
1056 .section        .pdata
1057 .align  4
1058         .rva    .LSEH_begin_aesni_gcm_decrypt
1059         .rva    .LSEH_end_aesni_gcm_decrypt
1060         .rva    .LSEH_gcm_dec_info
1061 
1062         .rva    .LSEH_begin_aesni_gcm_encrypt
1063         .rva    .LSEH_end_aesni_gcm_encrypt
1064         .rva    .LSEH_gcm_enc_info
1065 .section        .xdata
1066 .align  8
1067 .LSEH_gcm_dec_info:
1068         .byte   9,0,0,0
1069         .rva    gcm_se_handler
1070         .rva    .Lgcm_dec_body,.Lgcm_dec_abort
1071 .LSEH_gcm_enc_info:
1072         .byte   9,0,0,0
1073         .rva    gcm_se_handler
1074         .rva    .Lgcm_enc_body,.Lgcm_enc_abort
1075 ___
1076 }
1077 }}} else {{{
# Fallback for assemblers lacking the required AVX/PCLMULQDQ support (the
# capability test is earlier in this file): overwrite $code with stubs that
# return 0, signalling the caller to fall back to the generic code path.
1078 $code=<<___;    # assembler is too old
1079 .text
1080 
1081 .globl  aesni_gcm_encrypt
1082 .type   aesni_gcm_encrypt,\@abi-omnipotent
1083 aesni_gcm_encrypt:
1084         xor     %eax,%eax
1085         ret
1086 .size   aesni_gcm_encrypt,.-aesni_gcm_encrypt
1087 
1088 .globl  aesni_gcm_decrypt
1089 .type   aesni_gcm_decrypt,\@abi-omnipotent
1090 aesni_gcm_decrypt:
1091         xor     %eax,%eax
1092         ret
1093 .size   aesni_gcm_decrypt,.-aesni_gcm_decrypt
1094 ___
1095 }}}
1096
# Post-process and emit the generated assembly.  The substitution folds
# every `...` group by eval-ing its contents (compile-time constant
# arithmetic used throughout the code above).
1097 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
1098 
1099 print $code;
1100 
# Check the close: a failed/short write (full disk, broken pipe) must abort
# the build instead of leaving a silently truncated .s file behind.
1101 close STDOUT or die "error closing STDOUT: $!";