2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
18 # AES-NI-CTR+GHASH stitch.
22 # OpenSSL GCM implementation is organized in such a way that its
23 # performance is rather close to the sum of its streamed components,
24 # in this context parallelized AES-NI CTR and modulo-scheduled
25 # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
26 # was observed to perform significantly better than the sum of the
27 # components on contemporary CPUs, the effort was deemed impossible to
28 # justify. This module is based on combination of Intel submissions,
29 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
30 # Locktyukhin of Intel Corp. who verified that it reduces shuffles
31 # pressure with notable relative improvement, achieving 1.0 cycle per
32 # byte processed with 128-bit key on Haswell processor, 0.74 - on
33 # Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
34 # measurements for favourable packet size, one divisible by 96.
35 # Applications using the EVP interface will observe a few percent worse performance.]
38 # Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
40 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
41 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
43 # $output is the last argument if it looks like a file (it has an extension)
44 # $flavour is the first argument if it doesn't look like a file
45 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
46 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
48 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); # nasm/masm/mingw64 flavour or .asm output selects the Win64 code paths
50 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # $dir = directory part of this script's path
# Look for the translator next to this script first, then in ../../perlasm.
51 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
52 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
53 die "can't locate x86_64-xlate.pl";
# Assembler probing. Each probe sets $avx to 0, 1 or 2: one point for each
# version threshold the detected tool meets. Later probes run only while
# $avx is still false.
# NOTE(review): the captured version is compared as a decimal number, so a
# string such as "2.9" compares greater than 2.22 - confirm that is acceptable.
55 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
56 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
57 $avx = ($1>=2.20) + ($1>=2.22); # GNU as: thresholds 2.20 and 2.22
60 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
61 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
62 $avx = ($1>=2.09) + ($1>=2.10); # NASM: thresholds 2.09 and 2.10
65 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
66 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
67 $avx = ($1>=10) + ($1>=11); # ml64: major versions 10 and 11
70 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
71 $avx = ($2>=3.0) + ($2>3.0); # clang/LLVM: 3.0 gives 1, anything newer gives 2
# Everything written to OUT is piped through the translator script, which
# receives the flavour and the output file name as arguments.
74 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
75 or die "can't call $xlate: $!";
80 ($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); # the six function arguments, in parameter order
83 $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8)); # GHASH temporaries and accumulator, taken from %xmm0..%xmm8
85 ($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15)); # six AES blocks in flight plus the current round key: %xmm9..%xmm15
87 ($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15"); # counter word, AES round count, return value, constant-table pointer, input cursor and a limit derived from the buffer end
# _aesni_ctr32_ghash_6x: internal helper called from aesni_gcm_decrypt and
# aesni_gcm_encrypt. Six counter blocks (inout0..inout5) are taken through
# the AES rounds with vaesenc while vpclmulqdq multiplies previously staged
# blocks (I[0]..I[5]) by the hash-key powers read relative to Xip. The
# final vaesenclast also folds in the data at inp, and the six results are
# stored at out.
# Stack slots appear here as N+8(%rsp): a slot the caller wrote as
# 0x30(%rsp) is read in this routine as 0x30+8(%rsp).
92 .type _aesni_ctr32_ghash_6x,\@abi-omnipotent
94 _aesni_ctr32_ghash_6x:
95 vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
97 vpxor $Z0,$Z0,$Z0 # $Z0 = 0
98 vmovdqu 0x00-0x80($key),$rndkey # round key 0 (callers advanced key by 0x80)
99 vpaddb $T2,$T1,$inout1 # inout1..inout5 = successive counter blocks
100 vpaddb $T2,$inout1,$inout2
101 vpaddb $T2,$inout2,$inout3
102 vpaddb $T2,$inout3,$inout4
103 vpaddb $T2,$inout4,$inout5
104 vpxor $rndkey,$T1,$inout0 # first block: counter xor round key 0
105 vmovdqu $Z0,16+8(%rsp) # "$Z3" = 0
110 add \$`6<<24`,$counter
111 jc .Lhandle_ctr32 # discard $inout[1-5]?
112 vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
113 vpaddb $T2,$inout5,$T1 # next counter value
114 vpxor $rndkey,$inout1,$inout1
115 vpxor $rndkey,$inout2,$inout2
118 vmovdqu $T1,($ivp) # save next counter value
119 vpclmulqdq \$0x10,$Hkey,$Z3,$Z1
120 vpxor $rndkey,$inout3,$inout3
121 vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey
122 vpclmulqdq \$0x01,$Hkey,$Z3,$Z2
126 vaesenc $T2,$inout0,$inout0 # AES round 1 (key loaded from offset 0x10)
127 vmovdqu 0x30+8(%rsp),$Ii # I[4]
128 vpxor $rndkey,$inout4,$inout4
129 vpclmulqdq \$0x00,$Hkey,$Z3,$T1
130 vaesenc $T2,$inout1,$inout1
131 vpxor $rndkey,$inout5,$inout5
133 vpclmulqdq \$0x11,$Hkey,$Z3,$Z3
134 vaesenc $T2,$inout2,$inout2
135 vmovdqu 0x10-0x20($Xip),$Hkey # $Hkey^2
137 vaesenc $T2,$inout3,$inout3
139 vpclmulqdq \$0x00,$Hkey,$Ii,$Z1
140 vpxor $Z0,$Xi,$Xi # modulo-scheduled
141 vaesenc $T2,$inout4,$inout4
144 vmovups 0x20-0x80($key),$rndkey # round key 2
145 vpclmulqdq \$0x10,$Hkey,$Ii,$T1
146 vaesenc $T2,$inout5,$inout5
148 vpclmulqdq \$0x01,$Hkey,$Ii,$T2
150 vaesenc $rndkey,$inout0,$inout0
151 vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled [vpxor $Z3,$Xi,$Xi]
152 vpclmulqdq \$0x11,$Hkey,$Ii,$Hkey
153 vmovdqu 0x40+8(%rsp),$Ii # I[3]
154 vaesenc $rndkey,$inout1,$inout1
155 movbe 0x58($in0),%r13 # byte-reversed load of the block at in0+0x50
156 vaesenc $rndkey,$inout2,$inout2
157 movbe 0x50($in0),%r12
158 vaesenc $rndkey,$inout3,$inout3
159 mov %r13,0x20+8(%rsp) # staged on the stack, reloaded further down as I[5]
160 vaesenc $rndkey,$inout4,$inout4
161 mov %r12,0x28+8(%rsp)
162 vmovdqu 0x30-0x20($Xip),$Z1 # borrow $Z1 for $Hkey^3
163 vaesenc $rndkey,$inout5,$inout5
165 vmovups 0x30-0x80($key),$rndkey # round key 3
167 vpclmulqdq \$0x00,$Z1,$Ii,$T1
168 vaesenc $rndkey,$inout0,$inout0
170 vpclmulqdq \$0x10,$Z1,$Ii,$T2
171 vaesenc $rndkey,$inout1,$inout1
173 vpclmulqdq \$0x01,$Z1,$Ii,$Hkey
174 vaesenc $rndkey,$inout2,$inout2
175 vpclmulqdq \$0x11,$Z1,$Ii,$Z1
176 vmovdqu 0x50+8(%rsp),$Ii # I[2]
177 vaesenc $rndkey,$inout3,$inout3
178 vaesenc $rndkey,$inout4,$inout4
180 vmovdqu 0x40-0x20($Xip),$T1 # borrow $T1 for $Hkey^4
181 vaesenc $rndkey,$inout5,$inout5
183 vmovups 0x40-0x80($key),$rndkey # round key 4
185 vpclmulqdq \$0x00,$T1,$Ii,$T2
186 vaesenc $rndkey,$inout0,$inout0
188 vpclmulqdq \$0x10,$T1,$Ii,$Hkey
189 vaesenc $rndkey,$inout1,$inout1
190 movbe 0x48($in0),%r13 # block at in0+0x40, staged at 0x30+8(%rsp)
192 vpclmulqdq \$0x01,$T1,$Ii,$Z1
193 vaesenc $rndkey,$inout2,$inout2
194 movbe 0x40($in0),%r12
195 vpclmulqdq \$0x11,$T1,$Ii,$T1
196 vmovdqu 0x60+8(%rsp),$Ii # I[1]
197 vaesenc $rndkey,$inout3,$inout3
198 mov %r13,0x30+8(%rsp)
199 vaesenc $rndkey,$inout4,$inout4
200 mov %r12,0x38+8(%rsp)
202 vmovdqu 0x60-0x20($Xip),$T2 # borrow $T2 for $Hkey^5
203 vaesenc $rndkey,$inout5,$inout5
205 vmovups 0x50-0x80($key),$rndkey # round key 5
207 vpclmulqdq \$0x00,$T2,$Ii,$Hkey
208 vaesenc $rndkey,$inout0,$inout0
210 vpclmulqdq \$0x10,$T2,$Ii,$Z1
211 vaesenc $rndkey,$inout1,$inout1
212 movbe 0x38($in0),%r13 # block at in0+0x30, staged at 0x40+8(%rsp)
214 vpclmulqdq \$0x01,$T2,$Ii,$T1
215 vpxor 0x70+8(%rsp),$Xi,$Xi # accumulate I[0]
216 vaesenc $rndkey,$inout2,$inout2
217 movbe 0x30($in0),%r12
218 vpclmulqdq \$0x11,$T2,$Ii,$T2
219 vaesenc $rndkey,$inout3,$inout3
220 mov %r13,0x40+8(%rsp)
221 vaesenc $rndkey,$inout4,$inout4
222 mov %r12,0x48+8(%rsp)
224 vmovdqu 0x70-0x20($Xip),$Hkey # $Hkey^6
225 vaesenc $rndkey,$inout5,$inout5
227 vmovups 0x60-0x80($key),$rndkey # round key 6
229 vpclmulqdq \$0x10,$Hkey,$Xi,$Z1
230 vaesenc $rndkey,$inout0,$inout0
232 vpclmulqdq \$0x01,$Hkey,$Xi,$T1
233 vaesenc $rndkey,$inout1,$inout1
234 movbe 0x28($in0),%r13 # block at in0+0x20, staged at 0x50+8(%rsp)
236 vpclmulqdq \$0x00,$Hkey,$Xi,$T2
237 vaesenc $rndkey,$inout2,$inout2
238 movbe 0x20($in0),%r12
239 vpclmulqdq \$0x11,$Hkey,$Xi,$Xi
240 vaesenc $rndkey,$inout3,$inout3
241 mov %r13,0x50+8(%rsp)
242 vaesenc $rndkey,$inout4,$inout4
243 mov %r12,0x58+8(%rsp)
245 vaesenc $rndkey,$inout5,$inout5
248 vmovups 0x70-0x80($key),$rndkey # round key 7
251 vmovdqu 0x10($const),$Hkey # .Lpoly
253 vaesenc $rndkey,$inout0,$inout0
255 vaesenc $rndkey,$inout1,$inout1
257 movbe 0x18($in0),%r13 # block at in0+0x10, staged at 0x60+8(%rsp)
258 vaesenc $rndkey,$inout2,$inout2
259 movbe 0x10($in0),%r12
260 vpalignr \$8,$Z0,$Z0,$Ii # 1st phase
261 vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 # multiply by the .Lpoly constant loaded above
262 mov %r13,0x60+8(%rsp)
263 vaesenc $rndkey,$inout3,$inout3
264 mov %r12,0x68+8(%rsp)
265 vaesenc $rndkey,$inout4,$inout4
266 vmovups 0x80-0x80($key),$T1 # borrow $T1 for $rndkey
267 vaesenc $rndkey,$inout5,$inout5
269 vaesenc $T1,$inout0,$inout0
270 vmovups 0x90-0x80($key),$rndkey # round key 9
271 vaesenc $T1,$inout1,$inout1
273 vaesenc $T1,$inout2,$inout2
275 vaesenc $T1,$inout3,$inout3
277 movbe 0x08($in0),%r13 # block at in0+0x00, stored to 0x70+8(%rsp) in the tail
278 vaesenc $T1,$inout4,$inout4
279 movbe 0x00($in0),%r12
280 vaesenc $T1,$inout5,$inout5
281 vmovups 0xa0-0x80($key),$T1 # round key 10
283 jb .Lenc_tail # 128-bit key
285 vaesenc $rndkey,$inout0,$inout0
286 vaesenc $rndkey,$inout1,$inout1
287 vaesenc $rndkey,$inout2,$inout2
288 vaesenc $rndkey,$inout3,$inout3
289 vaesenc $rndkey,$inout4,$inout4
290 vaesenc $rndkey,$inout5,$inout5
292 vaesenc $T1,$inout0,$inout0
293 vaesenc $T1,$inout1,$inout1
294 vaesenc $T1,$inout2,$inout2
295 vaesenc $T1,$inout3,$inout3
296 vaesenc $T1,$inout4,$inout4
297 vmovups 0xb0-0x80($key),$rndkey # round key 11
298 vaesenc $T1,$inout5,$inout5
299 vmovups 0xc0-0x80($key),$T1 # round key 12
300 je .Lenc_tail # 192-bit key
302 vaesenc $rndkey,$inout0,$inout0
303 vaesenc $rndkey,$inout1,$inout1
304 vaesenc $rndkey,$inout2,$inout2
305 vaesenc $rndkey,$inout3,$inout3
306 vaesenc $rndkey,$inout4,$inout4
307 vaesenc $rndkey,$inout5,$inout5
309 vaesenc $T1,$inout0,$inout0
310 vaesenc $T1,$inout1,$inout1
311 vaesenc $T1,$inout2,$inout2
312 vaesenc $T1,$inout3,$inout3
313 vaesenc $T1,$inout4,$inout4
314 vmovups 0xd0-0x80($key),$rndkey # round key 13
315 vaesenc $T1,$inout5,$inout5
316 vmovups 0xe0-0x80($key),$T1 # round key 14
317 jmp .Lenc_tail # 256-bit key
# Counter-overflow path (presumably the target of the jc above): the
# counter is byte-swapped, stepped with 32-bit adds using the .Lone_lsb and
# .Ltwo_lsb constants, and swapped back.
321 vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
322 vpshufb $Ii,$T1,$Z2 # byte-swap counter
323 vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb
324 vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb
325 vpaddd $Z1,$Z2,$inout2
326 vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
327 vpaddd $Z1,$inout1,$inout3
328 vpshufb $Ii,$inout1,$inout1
329 vpaddd $Z1,$inout2,$inout4
330 vpshufb $Ii,$inout2,$inout2
331 vpxor $rndkey,$inout1,$inout1
332 vpaddd $Z1,$inout3,$inout5
333 vpshufb $Ii,$inout3,$inout3
334 vpxor $rndkey,$inout2,$inout2
335 vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value
336 vpshufb $Ii,$inout4,$inout4
337 vpshufb $Ii,$inout5,$inout5
338 vpshufb $Ii,$T1,$T1 # next counter value
# Final AES round: the round key held in T1 is xored with each data block
# read from inp, so the vaesenclast instructions below produce
# data xor keystream directly.
343 vaesenc $rndkey,$inout0,$inout0
344 vmovdqu $Z3,16+8(%rsp) # postpone vpxor $Z3,$Xi,$Xi
345 vpalignr \$8,$Z0,$Z0,$Xi # 2nd phase
346 vaesenc $rndkey,$inout1,$inout1
347 vpclmulqdq \$0x10,$Hkey,$Z0,$Z0
348 vpxor 0x00($inp),$T1,$T2
349 vaesenc $rndkey,$inout2,$inout2
350 vpxor 0x10($inp),$T1,$Ii
351 vaesenc $rndkey,$inout3,$inout3
352 vpxor 0x20($inp),$T1,$Z1
353 vaesenc $rndkey,$inout4,$inout4
354 vpxor 0x30($inp),$T1,$Z2
355 vaesenc $rndkey,$inout5,$inout5
356 vpxor 0x40($inp),$T1,$Z3
357 vpxor 0x50($inp),$T1,$Hkey
358 vmovdqu ($ivp),$T1 # load next counter value
360 vaesenclast $T2,$inout0,$inout0
361 vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
362 vaesenclast $Ii,$inout1,$inout1
364 mov %r13,0x70+8(%rsp) # stage the block loaded from in0+0x00
366 vaesenclast $Z1,$inout2,$inout2
368 mov %r12,0x78+8(%rsp)
370 vmovdqu 0x00-0x80($key),$rndkey # reload round key 0
371 vaesenclast $Z2,$inout3,$inout3
373 vaesenclast $Z3, $inout4,$inout4
375 vaesenclast $Hkey,$inout5,$inout5
382 vmovups $inout0,-0x60($out) # save output
383 vpxor $rndkey,$T1,$inout0 # next counter xor round key 0
384 vmovups $inout1,-0x50($out)
385 vmovdqa $Ii,$inout1 # 0 latency
386 vmovups $inout2,-0x40($out)
387 vmovdqa $Z1,$inout2 # 0 latency
388 vmovups $inout3,-0x30($out)
389 vmovdqa $Z2,$inout3 # 0 latency
390 vmovups $inout4,-0x20($out)
391 vmovdqa $Z3,$inout4 # 0 latency
392 vmovups $inout5,-0x10($out)
393 vmovdqa $Hkey,$inout5 # 0 latency
394 vmovdqu 0x20+8(%rsp),$Z3 # I[5]
398 vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled
399 vpxor $Z0,$Xi,$Xi # modulo-scheduled
402 .size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
404 ######################################################################
406 # size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
407 # const AES_KEY *key, unsigned char iv[16],
408 # struct { u128 Xi,H,Htbl[9]; } *Xip);
# aesni_gcm_decrypt: public entry point taking six arguments (see the
# aesni_gcm_[en|de]crypt prototype comment in this file). 0x60 bytes is the
# minimal accepted length. The first six input blocks are byte-swapped and
# staged as GHASH input before _aesni_ctr32_ghash_6x is called: five on
# the stack and the sixth in a register.
410 .globl aesni_gcm_decrypt
411 .type aesni_gcm_decrypt,\@function,6
416 cmp \$0x60,$len # minimal accepted length
419 lea (%rsp),%rax # save stack pointer
420 .cfi_def_cfa_register %rax
434 $code.=<<___ if ($win64); # Win64 only: preserve %xmm6-%xmm15 below the saved stack pointer
436 movaps %xmm6,-0xd8(%rax)
437 movaps %xmm7,-0xc8(%rax)
438 movaps %xmm8,-0xb8(%rax)
439 movaps %xmm9,-0xa8(%rax)
440 movaps %xmm10,-0x98(%rax)
441 movaps %xmm11,-0x88(%rax)
442 movaps %xmm12,-0x78(%rax)
443 movaps %xmm13,-0x68(%rax)
444 movaps %xmm14,-0x58(%rax)
445 movaps %xmm15,-0x48(%rax)
451 vmovdqu ($ivp),$T1 # input counter value
453 mov 12($ivp),$counter # last 32-bit word of the counter block
454 lea .Lbswap_mask(%rip),$const # base address used for all constant loads
455 lea -0x80($key),$in0 # borrow $in0
456 mov \$0xf80,$end0 # borrow $end0
457 vmovdqu ($Xip),$Xi # load Xi
458 and \$-128,%rsp # ensure stack alignment
459 vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
460 lea 0x80($key),$key # size optimization
461 lea 0x20+0x20($Xip),$Xip # size optimization
462 mov 0xf0-0x80($key),$rounds # word at offset 0xf0 of the original key pointer
468 jc .Ldec_no_key_aliasing
470 jnc .Ldec_no_key_aliasing
471 sub $end0,%rsp # avoid aliasing with key
472 .Ldec_no_key_aliasing:
474 vmovdqu 0x50($inp),$Z3 # I[5]
476 vmovdqu 0x40($inp),$Z0
477 lea -0xc0($inp,$len),$end0
478 vmovdqu 0x30($inp),$Z1
481 vmovdqu 0x20($inp),$Z2
482 vpshufb $Ii,$Z3,$Z3 # passed to _aesni_ctr32_ghash_6x
483 vmovdqu 0x10($inp),$T2
487 vmovdqu $Z0,0x30(%rsp) # stack slots 0x30..0x70 hold the staged GHASH input blocks
489 vmovdqu $Z1,0x40(%rsp)
491 vmovdqu $Z2,0x50(%rsp)
492 vpshufb $Ii,$Hkey,$Hkey
493 vmovdqu $T2,0x60(%rsp)
494 vmovdqu $Hkey,0x70(%rsp)
496 call _aesni_ctr32_ghash_6x
498 vmovups $inout0,-0x60($out) # save output
499 vmovups $inout1,-0x50($out)
500 vmovups $inout2,-0x40($out)
501 vmovups $inout3,-0x30($out)
502 vmovups $inout4,-0x20($out)
503 vmovups $inout5,-0x10($out)
505 vpshufb ($const),$Xi,$Xi # .Lbswap_mask
506 vmovdqu $Xi,-0x40($Xip) # output Xi
510 $code.=<<___ if ($win64); # Win64 only: restore %xmm6-%xmm15
511 movaps -0xd8(%rax),%xmm6
512 movaps -0xc8(%rax),%xmm7
513 movaps -0xb8(%rax),%xmm8
514 movaps -0xa8(%rax),%xmm9
515 movaps -0x98(%rax),%xmm10
516 movaps -0x88(%rax),%xmm11
517 movaps -0x78(%rax),%xmm12
518 movaps -0x68(%rax),%xmm13
519 movaps -0x58(%rax),%xmm14
520 movaps -0x48(%rax),%xmm15
535 lea (%rax),%rsp # restore %rsp
536 .cfi_def_cfa_register %rsp
538 mov $ret,%rax # return value
541 .size aesni_gcm_decrypt,.-aesni_gcm_decrypt
# _aesni_ctr32_6x: internal helper that runs six counter blocks through
# AES, xors the result with 0x60 bytes read from inp and stores six blocks
# at out. No GHASH work is done here.
545 .type _aesni_ctr32_6x,\@abi-omnipotent
548 vmovdqu 0x00-0x80($key),$Z0 # borrow $Z0 for $rndkey
549 vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
551 vmovups 0x10-0x80($key),$rndkey # round key 1
552 lea 0x20-0x80($key),%r12 # r12 = address of round key 2
553 vpxor $Z0,$T1,$inout0 # inout0 = counter xor round key 0
554 add \$`6<<24`,$counter
556 vpaddb $T2,$T1,$inout1 # inout1..inout5 = successive counter blocks
557 vpaddb $T2,$inout1,$inout2
558 vpxor $Z0,$inout1,$inout1
559 vpaddb $T2,$inout2,$inout3
560 vpxor $Z0,$inout2,$inout2
561 vpaddb $T2,$inout3,$inout4
562 vpxor $Z0,$inout3,$inout3
563 vpaddb $T2,$inout4,$inout5
564 vpxor $Z0,$inout4,$inout4
565 vpaddb $T2,$inout5,$T1 # T1 = counter for the following batch
566 vpxor $Z0,$inout5,$inout5
571 vaesenc $rndkey,$inout0,$inout0
572 vaesenc $rndkey,$inout1,$inout1
573 vaesenc $rndkey,$inout2,$inout2
574 vaesenc $rndkey,$inout3,$inout3
575 vaesenc $rndkey,$inout4,$inout4
576 vaesenc $rndkey,$inout5,$inout5
577 vmovups (%r12),$rndkey # fetch the round key r12 points at
582 vmovdqu (%r12),$Hkey # last round key
583 vaesenc $rndkey,$inout0,$inout0
584 vpxor 0x00($inp),$Hkey,$Z0 # input xor last round key, consumed by vaesenclast below
585 vaesenc $rndkey,$inout1,$inout1
586 vpxor 0x10($inp),$Hkey,$Z1
587 vaesenc $rndkey,$inout2,$inout2
588 vpxor 0x20($inp),$Hkey,$Z2
589 vaesenc $rndkey,$inout3,$inout3
590 vpxor 0x30($inp),$Hkey,$Xi
591 vaesenc $rndkey,$inout4,$inout4
592 vpxor 0x40($inp),$Hkey,$T2
593 vaesenc $rndkey,$inout5,$inout5
594 vpxor 0x50($inp),$Hkey,$Hkey
597 vaesenclast $Z0,$inout0,$inout0
598 vaesenclast $Z1,$inout1,$inout1
599 vaesenclast $Z2,$inout2,$inout2
600 vaesenclast $Xi,$inout3,$inout3
601 vaesenclast $T2,$inout4,$inout4
602 vaesenclast $Hkey,$inout5,$inout5
603 vmovups $inout0,0x00($out) # store the six output blocks
604 vmovups $inout1,0x10($out)
605 vmovups $inout2,0x20($out)
606 vmovups $inout3,0x30($out)
607 vmovups $inout4,0x40($out)
608 vmovups $inout5,0x50($out)
# Counter-overflow variant of the increments (presumably reached when the
# add above carries): byte-swap the counter, step it with 32-bit adds, and
# swap each block back before xoring in round key 0.
614 vpshufb $Ii,$T1,$Z2 # byte-swap counter
615 vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb
616 vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb
617 vpaddd $Z1,$Z2,$inout2
618 vpaddd $Z1,$inout1,$inout3
619 vpshufb $Ii,$inout1,$inout1
620 vpaddd $Z1,$inout2,$inout4
621 vpshufb $Ii,$inout2,$inout2
622 vpxor $Z0,$inout1,$inout1
623 vpaddd $Z1,$inout3,$inout5
624 vpshufb $Ii,$inout3,$inout3
625 vpxor $Z0,$inout2,$inout2
626 vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value
627 vpshufb $Ii,$inout4,$inout4
628 vpxor $Z0,$inout3,$inout3
629 vpshufb $Ii,$inout5,$inout5
630 vpxor $Z0,$inout4,$inout4
631 vpshufb $Ii,$T1,$T1 # next counter value
632 vpxor $Z0,$inout5,$inout5
634 .size _aesni_ctr32_6x,.-_aesni_ctr32_6x
# aesni_gcm_encrypt: public entry point taking six arguments (see the
# aesni_gcm_[en|de]crypt prototype comment in this file). The minimal
# accepted length is 3*0x60 bytes. Byte-swapped copies of the produced
# output blocks are kept on the stack as GHASH input, and the blocks still
# pending after _aesni_ctr32_ghash_6x returns are hashed in the tail below.
636 .globl aesni_gcm_encrypt
637 .type aesni_gcm_encrypt,\@function,6
642 cmp \$0x60*3,$len # minimal accepted length
645 lea (%rsp),%rax # save stack pointer
646 .cfi_def_cfa_register %rax
660 $code.=<<___ if ($win64); # Win64 only: preserve %xmm6-%xmm15 below the saved stack pointer
662 movaps %xmm6,-0xd8(%rax)
663 movaps %xmm7,-0xc8(%rax)
664 movaps %xmm8,-0xb8(%rax)
665 movaps %xmm9,-0xa8(%rax)
666 movaps %xmm10,-0x98(%rax)
667 movaps %xmm11,-0x88(%rax)
668 movaps %xmm12,-0x78(%rax)
669 movaps %xmm13,-0x68(%rax)
670 movaps %xmm14,-0x58(%rax)
671 movaps %xmm15,-0x48(%rax)
677 vmovdqu ($ivp),$T1 # input counter value
679 mov 12($ivp),$counter # last 32-bit word of the counter block
680 lea .Lbswap_mask(%rip),$const # base address used for all constant loads
681 lea -0x80($key),$in0 # borrow $in0
682 mov \$0xf80,$end0 # borrow $end0
683 lea 0x80($key),$key # size optimization
684 vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
685 and \$-128,%rsp # ensure stack alignment
686 mov 0xf0-0x80($key),$rounds # word at offset 0xf0 of the original key pointer
691 jc .Lenc_no_key_aliasing
693 jnc .Lenc_no_key_aliasing
694 sub $end0,%rsp # avoid aliasing with key
695 .Lenc_no_key_aliasing:
698 lea -0xc0($out,$len),$end0
702 vpshufb $Ii,$inout0,$Xi # save bswapped output on stack
703 vpshufb $Ii,$inout1,$T2
704 vmovdqu $Xi,0x70(%rsp)
705 vpshufb $Ii,$inout2,$Z0
706 vmovdqu $T2,0x60(%rsp)
707 vpshufb $Ii,$inout3,$Z1
708 vmovdqu $Z0,0x50(%rsp)
709 vpshufb $Ii,$inout4,$Z2
710 vmovdqu $Z1,0x40(%rsp)
711 vpshufb $Ii,$inout5,$Z3 # passed to _aesni_ctr32_ghash_6x
712 vmovdqu $Z2,0x30(%rsp)
716 vmovdqu ($Xip),$Xi # load Xi
717 lea 0x20+0x20($Xip),$Xip # size optimization
722 call _aesni_ctr32_ghash_6x
723 vmovdqu 0x20(%rsp),$Z3 # I[5]
724 vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
725 vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
726 vpunpckhqdq $Z3,$Z3,$T1
727 vmovdqu 0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK
728 vmovups $inout0,-0x60($out) # save output
729 vpshufb $Ii,$inout0,$inout0 # but keep bswapped copy
731 vmovups $inout1,-0x50($out)
732 vpshufb $Ii,$inout1,$inout1
733 vmovups $inout2,-0x40($out)
734 vpshufb $Ii,$inout2,$inout2
735 vmovups $inout3,-0x30($out)
736 vpshufb $Ii,$inout3,$inout3
737 vmovups $inout4,-0x20($out)
738 vpshufb $Ii,$inout4,$inout4
739 vmovups $inout5,-0x10($out)
740 vpshufb $Ii,$inout5,$inout5
741 vmovdqu $inout0,0x10(%rsp) # free $inout0
743 { my ($HK,$T3)=($rndkey,$inout0); # local aliases: HK reuses the round-key register, T3 the just-freed inout0
746 vmovdqu 0x30(%rsp),$Z2 # I[4]
747 vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2
748 vpunpckhqdq $Z2,$Z2,$T2
749 vpclmulqdq \$0x00,$Hkey,$Z3,$Z1
751 vpclmulqdq \$0x11,$Hkey,$Z3,$Z3
752 vpclmulqdq \$0x00,$HK,$T1,$T1
754 vmovdqu 0x40(%rsp),$T3 # I[3]
755 vpclmulqdq \$0x00,$Ii,$Z2,$Z0
756 vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3
758 vpunpckhqdq $T3,$T3,$Z1
759 vpclmulqdq \$0x11,$Ii,$Z2,$Z2
762 vpclmulqdq \$0x10,$HK,$T2,$T2
763 vmovdqu 0x50-0x20($Xip),$HK
766 vmovdqu 0x50(%rsp),$T1 # I[2]
767 vpclmulqdq \$0x00,$Hkey,$T3,$Z3
768 vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4
770 vpunpckhqdq $T1,$T1,$Z0
771 vpclmulqdq \$0x11,$Hkey,$T3,$T3
774 vpclmulqdq \$0x00,$HK,$Z1,$Z1
777 vmovdqu 0x60(%rsp),$T2 # I[1]
778 vpclmulqdq \$0x00,$Ii,$T1,$Z2
779 vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5
781 vpunpckhqdq $T2,$T2,$Z3
782 vpclmulqdq \$0x11,$Ii,$T1,$T1
785 vpclmulqdq \$0x10,$HK,$Z0,$Z0
786 vmovdqu 0x80-0x20($Xip),$HK
789 vpxor 0x70(%rsp),$Xi,$Xi # accumulate I[0]
790 vpclmulqdq \$0x00,$Hkey,$T2,$Z1
791 vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6
792 vpunpckhqdq $Xi,$Xi,$T3
794 vpclmulqdq \$0x11,$Hkey,$T2,$T2
797 vpclmulqdq \$0x00,$HK,$Z3,$Z3
800 vpclmulqdq \$0x00,$Ii,$Xi,$Z2
801 vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
802 vpunpckhqdq $inout5,$inout5,$T1
803 vpclmulqdq \$0x11,$Ii,$Xi,$Xi
804 vpxor $inout5,$T1,$T1
806 vpclmulqdq \$0x10,$HK,$T3,$T3
807 vmovdqu 0x20-0x20($Xip),$HK
811 vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2
812 vpxor $Z1,$Z3,$T3 # aggregated Karatsuba post-processing
813 vpclmulqdq \$0x00,$Hkey,$inout5,$Z0
815 vpunpckhqdq $inout4,$inout4,$T2
816 vpclmulqdq \$0x11,$Hkey,$inout5,$inout5
817 vpxor $inout4,$T2,$T2
819 vpclmulqdq \$0x00,$HK,$T1,$T1
824 vpclmulqdq \$0x00,$Ii,$inout4,$Z1
825 vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3
827 vpunpckhqdq $inout3,$inout3,$T3
828 vpclmulqdq \$0x11,$Ii,$inout4,$inout4
829 vpxor $inout3,$T3,$T3
830 vpxor $inout5,$inout4,$inout4
831 vpalignr \$8,$Xi,$Xi,$inout5 # 1st phase
832 vpclmulqdq \$0x10,$HK,$T2,$T2
833 vmovdqu 0x50-0x20($Xip),$HK
836 vpclmulqdq \$0x00,$Hkey,$inout3,$Z0
837 vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4
839 vpunpckhqdq $inout2,$inout2,$T1
840 vpclmulqdq \$0x11,$Hkey,$inout3,$inout3
841 vpxor $inout2,$T1,$T1
842 vpxor $inout4,$inout3,$inout3
843 vxorps 0x10(%rsp),$Z3,$Z3 # accumulate $inout0
844 vpclmulqdq \$0x00,$HK,$T3,$T3
847 vpclmulqdq \$0x10,0x10($const),$Xi,$Xi # multiply by the constant at 0x10(const), i.e. .Lpoly
848 vxorps $inout5,$Xi,$Xi
850 vpclmulqdq \$0x00,$Ii,$inout2,$Z1
851 vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5
853 vpunpckhqdq $inout1,$inout1,$T2
854 vpclmulqdq \$0x11,$Ii,$inout2,$inout2
855 vpxor $inout1,$T2,$T2
856 vpalignr \$8,$Xi,$Xi,$inout5 # 2nd phase
857 vpxor $inout3,$inout2,$inout2
858 vpclmulqdq \$0x10,$HK,$T1,$T1
859 vmovdqu 0x80-0x20($Xip),$HK
862 vxorps $Z3,$inout5,$inout5
863 vpclmulqdq \$0x10,0x10($const),$Xi,$Xi # multiply by the constant at 0x10(const), i.e. .Lpoly
864 vxorps $inout5,$Xi,$Xi
866 vpclmulqdq \$0x00,$Hkey,$inout1,$Z0
867 vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6
869 vpunpckhqdq $Xi,$Xi,$T3
870 vpclmulqdq \$0x11,$Hkey,$inout1,$inout1
872 vpxor $inout2,$inout1,$inout1
873 vpclmulqdq \$0x00,$HK,$T2,$T2
876 vpclmulqdq \$0x00,$Ii,$Xi,$Z1
877 vpclmulqdq \$0x11,$Ii,$Xi,$Z3
879 vpclmulqdq \$0x10,$HK,$T3,$Z2
880 vpxor $inout1,$Z3,$Z3
883 vpxor $Z1,$Z3,$Z0 # aggregated Karatsuba post-processing
886 vmovdqu 0x10($const),$Hkey # .Lpoly
891 vpalignr \$8,$Xi,$Xi,$T2 # 1st phase
892 vpclmulqdq \$0x10,$Hkey,$Xi,$Xi
895 vpalignr \$8,$Xi,$Xi,$T2 # 2nd phase
896 vpclmulqdq \$0x10,$Hkey,$Xi,$Xi
902 vpshufb ($const),$Xi,$Xi # .Lbswap_mask
903 vmovdqu $Xi,-0x40($Xip) # output Xi
907 $code.=<<___ if ($win64); # Win64 only: restore %xmm6-%xmm15
908 movaps -0xd8(%rax),%xmm6
909 movaps -0xc8(%rax),%xmm7
910 movaps -0xb8(%rax),%xmm8
911 movaps -0xa8(%rax),%xmm9
912 movaps -0x98(%rax),%xmm10
913 movaps -0x88(%rax),%xmm11
914 movaps -0x78(%rax),%xmm12
915 movaps -0x68(%rax),%xmm13
916 movaps -0x58(%rax),%xmm14
917 movaps -0x48(%rax),%xmm15
932 lea (%rax),%rsp # restore %rsp
933 .cfi_def_cfa_register %rsp
935 mov $ret,%rax # return value
938 .size aesni_gcm_encrypt,.-aesni_gcm_encrypt
# Constant table. The routines address it through the pointer they load
# with the address of .Lbswap_mask, so each 16-byte row below is reached
# at the offset noted next to it.
# NOTE(review): byte 0 of the 0x10 row is 0 here, while this GHASH
# reduction constant is commonly written with byte 0 = 1. The uses visible
# in this file select only the upper quadword (selector 0x10) - confirm
# that byte 0 = 0 is intended.
944 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 # +0x00: byte-reversal shuffle mask, referenced as .Lbswap_mask
946 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 # +0x10: referenced as .Lpoly
948 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 # +0x20: referenced as .Lone_msb, added with vpaddb to step the counter
950 .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 # +0x30: referenced as .Ltwo_lsb, used with vpaddd on the byte-swapped counter
952 .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 # +0x40: referenced as .Lone_lsb, used with vpaddd on the byte-swapped counter
953 .asciz "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
# gcm_se_handler: exception handler for the two public routines. When the
# faulting Rip lies between the prologue and epilogue labels taken from
# HandlerData, it rewrites the CONTEXT record (general registers, the
# %xmm6.. save area, Rsp/Rsi/Rdi). It then copies the context into
# disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
963 .extern __imp_RtlVirtualUnwind
964 .type gcm_se_handler,\@abi-omnipotent
978 mov 120($context),%rax # pull context->Rax
979 mov 248($context),%rbx # pull context->Rip
981 mov 8($disp),%rsi # disp->ImageBase
982 mov 56($disp),%r11 # disp->HandlerData
984 mov 0(%r11),%r10d # HandlerData[0]
985 lea (%rsi,%r10),%r10 # prologue label
986 cmp %r10,%rbx # context->Rip<prologue label
989 mov 152($context),%rax # pull context->Rsp
991 mov 4(%r11),%r10d # HandlerData[1]
992 lea (%rsi,%r10),%r10 # epilogue label
993 cmp %r10,%rbx # context->Rip>=epilogue label
994 jae .Lcommon_seh_tail
996 mov 120($context),%rax # pull context->Rax
1004 mov %r15,240($context) # restore context->R15
1005 mov %r14,232($context) # restore context->R14
1006 mov %r13,224($context) # restore context->R13
1007 mov %r12,216($context) # restore context->R12
1008 mov %rbp,160($context) # restore context->Rbp
1009 mov %rbx,144($context) # restore context->Rbx
1011 lea -0xd8(%rax),%rsi # %xmm save area
1012 lea 512($context),%rdi # & context.Xmm6
1013 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
1014 .long 0xa548f3fc # cld; rep movsq
1019 mov %rax,152($context) # restore context->Rsp
1020 mov %rsi,168($context) # restore context->Rsi
1021 mov %rdi,176($context) # restore context->Rdi
1023 mov 40($disp),%rdi # disp->ContextRecord
1024 mov $context,%rsi # context
1025 mov \$154,%ecx # sizeof(CONTEXT) in quadwords, the count for rep movsq
1026 .long 0xa548f3fc # cld; rep movsq
1029 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1030 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1031 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1032 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1033 mov 40(%rsi),%r10 # disp->ContextRecord
1034 lea 56(%rsi),%r11 # &disp->HandlerData
1035 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1036 mov %r10,32(%rsp) # arg5
1037 mov %r11,40(%rsp) # arg6
1038 mov %r12,48(%rsp) # arg7
1039 mov %rcx,56(%rsp) # arg8, (NULL)
1040 call *__imp_RtlVirtualUnwind(%rip)
1042 mov \$1,%eax # ExceptionContinueSearch
1054 .size gcm_se_handler,.-gcm_se_handler
# Image-relative table entries for the two public routines: begin address,
# end address and the address of the per-function info record.
# NOTE(review): presumably these land in the unwind sections (.pdata and
# .xdata), but the section directives are not adjacent to these lines -
# confirm.
1058 .rva .LSEH_begin_aesni_gcm_decrypt
1059 .rva .LSEH_end_aesni_gcm_decrypt
1060 .rva .LSEH_gcm_dec_info
1062 .rva .LSEH_begin_aesni_gcm_encrypt
1063 .rva .LSEH_end_aesni_gcm_encrypt
1064 .rva .LSEH_gcm_enc_info
1070 .rva .Lgcm_dec_body,.Lgcm_dec_abort # presumably HandlerData[0] and [1] for decrypt, the prologue and epilogue labels gcm_se_handler compares Rip against
1074 .rva .Lgcm_enc_body,.Lgcm_enc_abort # presumably the same pair for encrypt
1078 $code=<<___; # assembler is too old
# Fallback definitions: both public symbols are still emitted on this path.
1081 .globl aesni_gcm_encrypt
1082 .type aesni_gcm_encrypt,\@abi-omnipotent
1086 .size aesni_gcm_encrypt,.-aesni_gcm_encrypt
1088 .globl aesni_gcm_decrypt
1089 .type aesni_gcm_decrypt,\@abi-omnipotent
1093 .size aesni_gcm_decrypt,.-aesni_gcm_decrypt
1097 $code =~ s/\`([^\`]*)\`/eval($1)/gem; # replace every backtick-quoted span in the generated text with the result of eval'ing it as Perl (e.g. the 6<<24 immediates)