3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
11 # AES-NI-CTR+GHASH stitch.
15 # OpenSSL GCM implementation is organized in such way that its
16 # performance is rather close to the sum of its streamed components,
17 # in the context parallelized AES-NI CTR and modulo-scheduled
18 # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
19 # was observed to perform significantly better than the sum of the
20 # components on contemporary CPUs, the effort was deemed impossible to
21 # justify. This module is based on combination of Intel submissions,
22 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
23 # Locktyukhin of Intel Corp. who verified that it reduces shuffles
24 # pressure with notable relative improvement on upcoming Haswell
25 # processor. [Exact performance numbers to be added at launch.]
27 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
28 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
# Command-line plumbing: a lone argument containing a dot is taken as the
# output file name rather than a perlasm flavour (elf/macosx/nasm/masm/...).
# NOTE(review): this excerpt is non-contiguous (embedded original line
# numbers show gaps), so some statements/braces are not visible here.
32 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 target is inferred from the flavour or a .asm output extension.
34 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the x86_64-xlate.pl translator relative to this script's path.
36 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
37 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
38 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
39 die "can't locate x86_64-xlate.pl";
# Probe the assembler for AVX support; the computed level apparently gates
# whether the real module or the "assembler is too old" stub is emitted
# (see end of file). First try GNU as via the C compiler driver...
41 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
42 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
43 	$avx = ($1>=2.19) + ($1>=2.22);
# ...then nasm on Win64...
46 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
47 	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
48 	$avx = ($1>=2.09) + ($1>=2.10);
# ...then MSVC's ml64.
51 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
52 	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
53 	$avx = ($1>=10) + ($1>=11);
# Pipe the generated code through the translator into the requested format.
56 open OUT,"| \"$^X\" $xlate $flavour $output";
# Argument registers (SysV AMD64 order) for aesni_gcm_[en|de]crypt:
# inp, out, len, key, ivp, Xip.
61 ($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
# GHASH/scratch xmm registers xmm0-xmm8. NOTE(review): the opening of this
# list (the first few names) is not visible in this excerpt.
64 $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));
# Six parallel CTR blocks plus the current AES round key, xmm9-xmm15.
66 ($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));
# GP registers: 32-bit counter word, AES round count, return value,
# constant-table base, input pointer, end-of-input sentinel.
68 ($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
# _aesni_ctr32_ghash_6x: stitched inner loop — encrypts six AES-CTR blocks
# while folding six previously produced ciphertext blocks (staged on the
# stack) into the GHASH accumulator Xi via vpclmulqdq, interleaving the
# multiplies with the vaesenc rounds to fill both pipelines.
# NOTE(review): this excerpt is sampled; instructions between the embedded
# original line numbers are missing. Do not reorder anything here — the
# modulo-scheduling comments indicate the exact interleave is intentional.
73 .type _aesni_ctr32_ghash_6x,\@abi-omnipotent
75 _aesni_ctr32_ghash_6x:
# Derive six consecutive counter blocks by byte-adds of .Lone_msb
# (works while the low counter byte does not wrap; see .Lhandle_ctr32).
76 vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
78 vpxor $Z0,$Z0,$Z0 # $Z0 = 0
79 vmovdqu 0x00-0x80($key),$rndkey
80 vpaddb $T2,$T1,$inout1
81 vpaddb $T2,$inout1,$inout2
82 vpaddb $T2,$inout2,$inout3
83 vpaddb $T2,$inout3,$inout4
84 vpaddb $T2,$inout4,$inout5
85 vpxor $rndkey,$T1,$inout0
86 vmovdqu $Z0,16+8(%rsp) # "$Z3" = 0
# Carry set means the 32-bit counter wrapped: redo increments dword-wise.
92 jc .Lhandle_ctr32 # discard $inout[1-5]?
93 vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
94 vpaddb $T2,$inout5,$T1 # next counter value
95 vpxor $rndkey,$inout1,$inout1
96 vpxor $rndkey,$inout2,$inout2
99 vmovdqu $T1,($ivp) # save next counter value
# GHASH: multiply staged ciphertext I[5] by Hkey^1 (four partial products).
100 vpclmulqdq \$0x10,$Hkey,$Z3,$Z1
101 vpxor $rndkey,$inout3,$inout3
102 vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey
103 vpclmulqdq \$0x01,$Hkey,$Z3,$Z2
107 vaesenc $T2,$inout0,$inout0
108 vmovdqu 0x30+8(%rsp),$Ii # I[4]
109 vpxor $rndkey,$inout4,$inout4
110 vpclmulqdq \$0x00,$Hkey,$Z3,$T1
111 vaesenc $T2,$inout1,$inout1
112 vpxor $rndkey,$inout5,$inout5
114 vpclmulqdq \$0x11,$Hkey,$Z3,$Z3
115 vaesenc $T2,$inout2,$inout2
116 vmovdqu 0x10-0x20($Xip),$Hkey # $Hkey^2
118 vaesenc $T2,$inout3,$inout3
120 vpclmulqdq \$0x00,$Hkey,$Ii,$Z1
121 vpxor $Z0,$Xi,$Xi # modulo-scheduled
122 vaesenc $T2,$inout4,$inout4
125 vmovups 0x20-0x80($key),$rndkey
126 vpclmulqdq \$0x10,$Hkey,$Ii,$T1
127 vaesenc $T2,$inout5,$inout5
129 vpclmulqdq \$0x01,$Hkey,$Ii,$T2
131 vaesenc $rndkey,$inout0,$inout0
132 vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled [vpxor $Z3,$Xi,$Xi]
133 vpclmulqdq \$0x11,$Hkey,$Ii,$Hkey
134 vmovdqu 0x40+8(%rsp),$Ii # I[3]
135 vaesenc $rndkey,$inout1,$inout1
# movbe: stage the next iteration's byte-swapped input words on the stack
# (twist suggested in the header comments; reduces shuffle pressure).
136 movbe 0x58($in0),%r13
137 vaesenc $rndkey,$inout2,$inout2
138 movbe 0x50($in0),%r12
139 vaesenc $rndkey,$inout3,$inout3
140 mov %r13,0x20+8(%rsp)
141 vaesenc $rndkey,$inout4,$inout4
142 mov %r12,0x28+8(%rsp)
143 vmovdqu 0x30-0x20($Xip),$Z1 # borrow $Z1 for $Hkey^3
144 vaesenc $rndkey,$inout5,$inout5
146 vmovups 0x30-0x80($key),$rndkey
148 vpclmulqdq \$0x00,$Z1,$Ii,$T1
149 vaesenc $rndkey,$inout0,$inout0
151 vpclmulqdq \$0x10,$Z1,$Ii,$T2
152 vaesenc $rndkey,$inout1,$inout1
154 vpclmulqdq \$0x01,$Z1,$Ii,$Hkey
155 vaesenc $rndkey,$inout2,$inout2
156 vpclmulqdq \$0x11,$Z1,$Ii,$Z1
157 vmovdqu 0x50+8(%rsp),$Ii # I[2]
158 vaesenc $rndkey,$inout3,$inout3
159 vaesenc $rndkey,$inout4,$inout4
161 vmovdqu 0x40-0x20($Xip),$T1 # borrow $T1 for $Hkey^4
162 vaesenc $rndkey,$inout5,$inout5
164 vmovups 0x40-0x80($key),$rndkey
166 vpclmulqdq \$0x00,$T1,$Ii,$T2
167 vaesenc $rndkey,$inout0,$inout0
169 vpclmulqdq \$0x10,$T1,$Ii,$Hkey
170 vaesenc $rndkey,$inout1,$inout1
171 movbe 0x48($in0),%r13
173 vpclmulqdq \$0x01,$T1,$Ii,$Z1
174 vaesenc $rndkey,$inout2,$inout2
175 movbe 0x40($in0),%r12
176 vpclmulqdq \$0x11,$T1,$Ii,$T1
177 vmovdqu 0x60+8(%rsp),$Ii # I[1]
178 vaesenc $rndkey,$inout3,$inout3
179 mov %r13,0x30+8(%rsp)
180 vaesenc $rndkey,$inout4,$inout4
181 mov %r12,0x38+8(%rsp)
183 vmovdqu 0x60-0x20($Xip),$T2 # borrow $T2 for $Hkey^5
184 vaesenc $rndkey,$inout5,$inout5
186 vmovups 0x50-0x80($key),$rndkey
188 vpclmulqdq \$0x00,$T2,$Ii,$Hkey
189 vaesenc $rndkey,$inout0,$inout0
191 vpclmulqdq \$0x10,$T2,$Ii,$Z1
192 vaesenc $rndkey,$inout1,$inout1
193 movbe 0x38($in0),%r13
195 vpclmulqdq \$0x01,$T2,$Ii,$T1
196 vpxor 0x70+8(%rsp),$Xi,$Xi # accumulate I[0]
197 vaesenc $rndkey,$inout2,$inout2
198 movbe 0x30($in0),%r12
199 vpclmulqdq \$0x11,$T2,$Ii,$T2
200 vaesenc $rndkey,$inout3,$inout3
201 mov %r13,0x40+8(%rsp)
202 vaesenc $rndkey,$inout4,$inout4
203 mov %r12,0x48+8(%rsp)
205 vmovdqu 0x70-0x20($Xip),$Hkey # $Hkey^6
206 vaesenc $rndkey,$inout5,$inout5
208 vmovups 0x60-0x80($key),$rndkey
# Final GHASH column: (Xi + I[0]) * Hkey^6.
210 vpclmulqdq \$0x10,$Hkey,$Xi,$Z1
211 vaesenc $rndkey,$inout0,$inout0
213 vpclmulqdq \$0x01,$Hkey,$Xi,$T1
214 vaesenc $rndkey,$inout1,$inout1
215 movbe 0x28($in0),%r13
217 vpclmulqdq \$0x00,$Hkey,$Xi,$T2
218 vaesenc $rndkey,$inout2,$inout2
219 movbe 0x20($in0),%r12
220 vpclmulqdq \$0x11,$Hkey,$Xi,$Xi
221 vaesenc $rndkey,$inout3,$inout3
222 mov %r13,0x50+8(%rsp)
223 vaesenc $rndkey,$inout4,$inout4
224 mov %r12,0x58+8(%rsp)
226 vaesenc $rndkey,$inout5,$inout5
229 vmovups 0x70-0x80($key),$rndkey
# Begin polynomial reduction (interleaved with the remaining AES rounds).
232 vmovdqu 0x10($const),$Hkey # .Lpoly
234 vaesenc $rndkey,$inout0,$inout0
236 vaesenc $rndkey,$inout1,$inout1
238 movbe 0x18($in0),%r13
239 vaesenc $rndkey,$inout2,$inout2
240 movbe 0x10($in0),%r12
241 vpalignr \$8,$Z0,$Z0,$Ii # 1st phase
242 vpclmulqdq \$0x10,$Hkey,$Z0,$Z0
243 mov %r13,0x60+8(%rsp)
244 vaesenc $rndkey,$inout3,$inout3
245 mov %r12,0x68+8(%rsp)
246 vaesenc $rndkey,$inout4,$inout4
247 vmovups 0x80-0x80($key),$T1 # borrow $T1 for $rndkey
248 vaesenc $rndkey,$inout5,$inout5
250 vaesenc $T1,$inout0,$inout0
251 vmovups 0x90-0x80($key),$rndkey
252 vaesenc $T1,$inout1,$inout1
254 vaesenc $T1,$inout2,$inout2
256 vaesenc $T1,$inout3,$inout3
258 movbe 0x08($in0),%r13
259 vaesenc $T1,$inout4,$inout4
260 movbe 0x00($in0),%r12
261 vaesenc $T1,$inout5,$inout5
262 vmovups 0xa0-0x80($key),$T1
# Branch on key size: 10/12/14 rounds for AES-128/192/256.
264 jb .Lenc_tail # 128-bit key
266 vaesenc $rndkey,$inout0,$inout0
267 vaesenc $rndkey,$inout1,$inout1
268 vaesenc $rndkey,$inout2,$inout2
269 vaesenc $rndkey,$inout3,$inout3
270 vaesenc $rndkey,$inout4,$inout4
271 vaesenc $rndkey,$inout5,$inout5
273 vaesenc $T1,$inout0,$inout0
274 vaesenc $T1,$inout1,$inout1
275 vaesenc $T1,$inout2,$inout2
276 vaesenc $T1,$inout3,$inout3
277 vaesenc $T1,$inout4,$inout4
278 vmovups 0xb0-0x80($key),$rndkey
279 vaesenc $T1,$inout5,$inout5
280 vmovups 0xc0-0x80($key),$T1
281 je .Lenc_tail # 192-bit key
283 vaesenc $rndkey,$inout0,$inout0
284 vaesenc $rndkey,$inout1,$inout1
285 vaesenc $rndkey,$inout2,$inout2
286 vaesenc $rndkey,$inout3,$inout3
287 vaesenc $rndkey,$inout4,$inout4
288 vaesenc $rndkey,$inout5,$inout5
290 vaesenc $T1,$inout0,$inout0
291 vaesenc $T1,$inout1,$inout1
292 vaesenc $T1,$inout2,$inout2
293 vaesenc $T1,$inout3,$inout3
294 vaesenc $T1,$inout4,$inout4
295 vmovups 0xd0-0x80($key),$rndkey
296 vaesenc $T1,$inout5,$inout5
297 vmovups 0xe0-0x80($key),$T1
298 jmp .Lenc_tail # 256-bit key
# Counter-wrap path: recompute the six counters with 32-bit dword adds on
# the byte-swapped value, then swap back and pre-XOR with round key 0.
302 vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
303 vpshufb $Ii,$T1,$Z2 # byte-swap counter
304 vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb
305 vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb
306 vpaddd $Z1,$Z2,$inout2
307 vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
308 vpaddd $Z1,$inout1,$inout3
309 vpshufb $Ii,$inout1,$inout1
310 vpaddd $Z1,$inout2,$inout4
311 vpshufb $Ii,$inout2,$inout2
312 vpxor $rndkey,$inout1,$inout1
313 vpaddd $Z1,$inout3,$inout5
314 vpshufb $Ii,$inout3,$inout3
315 vpxor $rndkey,$inout2,$inout2
316 vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value
317 vpshufb $Ii,$inout4,$inout4
318 vpshufb $Ii,$inout5,$inout5
319 vpshufb $Ii,$T1,$T1 # next counter value
# .Lenc_tail: last AES round keys are XORed with the input stream so a
# single vaesenclast both finishes AES and produces the CTR keystream XOR.
324 vaesenc $rndkey,$inout0,$inout0
325 vmovdqu $Z3,16+8(%rsp) # postpone vpxor $Z3,$Xi,$Xi
326 vpalignr \$8,$Z0,$Z0,$Xi # 2nd phase
327 vaesenc $rndkey,$inout1,$inout1
328 vpclmulqdq \$0x10,$Hkey,$Z0,$Z0
329 vpxor 0x00($inp),$T1,$T2
330 vaesenc $rndkey,$inout2,$inout2
331 vpxor 0x10($inp),$T1,$Ii
332 vaesenc $rndkey,$inout3,$inout3
333 vpxor 0x20($inp),$T1,$Z1
334 vaesenc $rndkey,$inout4,$inout4
335 vpxor 0x30($inp),$T1,$Z2
336 vaesenc $rndkey,$inout5,$inout5
337 vpxor 0x40($inp),$T1,$Z3
338 vpxor 0x50($inp),$T1,$Hkey
339 vmovdqu ($ivp),$T1 # load next counter value
341 vaesenclast $T2,$inout0,$inout0
342 vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
343 vaesenclast $Ii,$inout1,$inout1
345 mov %r13,0x70+8(%rsp)
347 vaesenclast $Z1,$inout2,$inout2
349 mov %r12,0x78+8(%rsp)
351 vmovdqu 0x00-0x80($key),$rndkey
352 vaesenclast $Z2,$inout3,$inout3
354 vaesenclast $Z3, $inout4,$inout4
356 vaesenclast $Hkey,$inout5,$inout5
# Store the six output blocks and recycle the keystream-XORed inputs as
# the next iteration's working copies.
363 vmovups $inout0,-0x60($out) # save output
364 vpxor $rndkey,$T1,$inout0
365 vmovups $inout1,-0x50($out)
366 vmovdqa $Ii,$inout1 # 0 latency
367 vmovups $inout2,-0x40($out)
368 vmovdqa $Z1,$inout2 # 0 latency
369 vmovups $inout3,-0x30($out)
370 vmovdqa $Z2,$inout3 # 0 latency
371 vmovups $inout4,-0x20($out)
372 vmovdqa $Z3,$inout4 # 0 latency
373 vmovups $inout5,-0x10($out)
374 vmovdqa $Hkey,$inout5 # 0 latency
375 vmovdqu 0x20+8(%rsp),$Z3 # I[5]
379 vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled
380 vpxor $Z0,$Xi,$Xi # modulo-scheduled
383 .size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
385 ######################################################################
387 # size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
388 # const AES_KEY *key, unsigned char iv[16],
389 # struct { u128 Xi,H,Htbl[9]; } *Xip);
391 .globl aesni_gcm_decrypt
392 .type aesni_gcm_decrypt,\@function,6
# Reject inputs shorter than 6 blocks (0x60 bytes): the stitched loop
# always processes six blocks at a time.
396 cmp \$0x60,$len # minimal accepted length
399 lea (%rsp),%rax # save stack pointer
# Win64: xmm6-xmm15 are callee-saved and must be preserved across the call.
407 $code.=<<___ if ($win64);
409 movaps %xmm6,-0xd8(%rax)
410 movaps %xmm7,-0xc8(%rax)
411 movaps %xmm8,-0xb8(%rax)
412 movaps %xmm9,-0xa8(%rax)
413 movaps %xmm10,-0x98(%rax)
414 movaps %xmm11,-0x88(%rax)
415 movaps %xmm12,-0x78(%rax)
416 movaps %xmm13,-0x68(%rax)
417 movaps %xmm14,-0x58(%rax)
418 movaps %xmm15,-0x48(%rax)
424 vmovdqu ($ivp),$T1 # input counter value
426 mov 12($ivp),$counter
427 lea .Lbswap_mask(%rip),$const
428 vmovdqu ($Xip),$Xi # load Xi
429 and \$-64,%rsp # ensure stack alignment
430 vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
431 lea 0x80($key),$key # size optimization
432 lea 0x20+0x20($Xip),$Xip # size optimization
433 mov 0xf0-0x80($key),$rounds
# Decrypt path: the ciphertext itself is the GHASH input, so stage the
# first six byte-swapped input blocks on the stack ahead of the call.
436 vmovdqu 0x50($inp),$Z3 # I[5]
438 vmovdqu 0x40($inp),$Z0
439 lea -0xc0($inp,$len),$end0
440 vmovdqu 0x30($inp),$Z1
443 vmovdqu 0x20($inp),$Z2
444 vpshufb $Ii,$Z3,$Z3 # passed to _aesni_ctr32_ghash_6x
445 vmovdqu 0x10($inp),$T2
449 vmovdqu $Z0,0x30(%rsp)
451 vmovdqu $Z1,0x40(%rsp)
453 vmovdqu $Z2,0x50(%rsp)
454 vpshufb $Ii,$Hkey,$Hkey
455 vmovdqu $T2,0x60(%rsp)
456 vmovdqu $Hkey,0x70(%rsp)
458 call _aesni_ctr32_ghash_6x
# Flush the final six plaintext blocks and write back the updated Xi.
460 vmovups $inout0,-0x60($out) # save output
461 vmovups $inout1,-0x50($out)
462 vmovups $inout2,-0x40($out)
463 vmovups $inout3,-0x30($out)
464 vmovups $inout4,-0x20($out)
465 vmovups $inout5,-0x10($out)
467 vpshufb ($const),$Xi,$Xi # .Lbswap_mask
468 vmovdqu $Xi,-0x40($Xip) # output Xi
# Win64 epilogue: restore the ten callee-saved xmm registers from the
# save area written in the prologue (slots are 0x10 apart, xmm6 lowest).
472 $code.=<<___ if ($win64);
473 movaps -0xd8(%rax),%xmm6
474 movaps -0xc8(%rax),%xmm7	# fixed: was -0xd8, which is xmm6's slot and would clobber the caller's xmm7
475 movaps -0xb8(%rax),%xmm8
476 movaps -0xa8(%rax),%xmm9
477 movaps -0x98(%rax),%xmm10
478 movaps -0x88(%rax),%xmm11
479 movaps -0x78(%rax),%xmm12
480 movaps -0x68(%rax),%xmm13
481 movaps -0x58(%rax),%xmm14
482 movaps -0x48(%rax),%xmm15
# Unwind the frame and return the number of bytes processed.
491 lea (%rax),%rsp # restore %rsp
493 mov $ret,%rax # return value
495 .size aesni_gcm_decrypt,.-aesni_gcm_decrypt
# _aesni_ctr32_6x: plain (non-stitched) AES-CTR over six blocks, used by
# the encrypt path to produce ciphertext before the GHASH stitch can start.
# NOTE(review): excerpt is sampled; loop labels/branches between the
# embedded original line numbers are not visible here.
499 .type _aesni_ctr32_6x,\@abi-omnipotent
502 vmovdqu 0x00-0x80($key),$Z0 # borrow $Z0 for $rndkey
503 vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
505 vmovups 0x10-0x80($key),$rndkey
506 lea 0x20-0x80($key),%r12
# Build six counters with byte-adds and pre-XOR each with round key 0.
507 vpxor $Z0,$T1,$inout0
510 vpaddb $T2,$T1,$inout1
511 vpaddb $T2,$inout1,$inout2
512 vpxor $Z0,$inout1,$inout1
513 vpaddb $T2,$inout2,$inout3
514 vpxor $Z0,$inout2,$inout2
515 vpaddb $T2,$inout3,$inout4
516 vpxor $Z0,$inout3,$inout3
517 vpaddb $T2,$inout4,$inout5
518 vpxor $Z0,$inout4,$inout4
519 vpaddb $T2,$inout5,$T1
520 vpxor $Z0,$inout5,$inout5
# Round loop body: one vaesenc per block, next round key via %r12.
525 vaesenc $rndkey,$inout0,$inout0
526 vaesenc $rndkey,$inout1,$inout1
527 vaesenc $rndkey,$inout2,$inout2
528 vaesenc $rndkey,$inout3,$inout3
529 vaesenc $rndkey,$inout4,$inout4
530 vaesenc $rndkey,$inout5,$inout5
531 vmovups (%r12),$rndkey
# Final round: XOR the last round key with the input so vaesenclast
# completes AES and applies the CTR keystream in one step.
536 vmovdqu (%r12),$Hkey # last round key
537 vaesenc $rndkey,$inout0,$inout0
538 vpxor 0x00($inp),$Hkey,$Z0
539 vaesenc $rndkey,$inout1,$inout1
540 vpxor 0x10($inp),$Hkey,$Z1
541 vaesenc $rndkey,$inout2,$inout2
542 vpxor 0x20($inp),$Hkey,$Z2
543 vaesenc $rndkey,$inout3,$inout3
544 vpxor 0x30($inp),$Hkey,$Xi
545 vaesenc $rndkey,$inout4,$inout4
546 vpxor 0x40($inp),$Hkey,$T2
547 vaesenc $rndkey,$inout5,$inout5
548 vpxor 0x50($inp),$Hkey,$Hkey
551 vaesenclast $Z0,$inout0,$inout0
552 vaesenclast $Z1,$inout1,$inout1
553 vaesenclast $Z2,$inout2,$inout2
554 vaesenclast $Xi,$inout3,$inout3
555 vaesenclast $T2,$inout4,$inout4
556 vaesenclast $Hkey,$inout5,$inout5
557 vmovups $inout0,0x00($out)
558 vmovups $inout1,0x10($out)
559 vmovups $inout2,0x20($out)
560 vmovups $inout3,0x30($out)
561 vmovups $inout4,0x40($out)
562 vmovups $inout5,0x50($out)
# Counter-wrap path (mirrors .Lhandle_ctr32): dword adds on the
# byte-swapped counter, then swap back and pre-XOR with round key 0.
568 vpshufb $Ii,$T1,$Z2 # byte-swap counter
569 vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb
570 vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb
571 vpaddd $Z1,$Z2,$inout2
572 vpaddd $Z1,$inout1,$inout3
573 vpshufb $Ii,$inout1,$inout1
574 vpaddd $Z1,$inout2,$inout4
575 vpshufb $Ii,$inout2,$inout2
576 vpxor $Z0,$inout1,$inout1
577 vpaddd $Z1,$inout3,$inout5
578 vpshufb $Ii,$inout3,$inout3
579 vpxor $Z0,$inout2,$inout2
580 vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value
581 vpshufb $Ii,$inout4,$inout4
582 vpxor $Z0,$inout3,$inout3
583 vpshufb $Ii,$inout5,$inout5
584 vpxor $Z0,$inout4,$inout4
585 vpshufb $Ii,$T1,$T1 # next counter value
586 vpxor $Z0,$inout5,$inout5
588 .size _aesni_ctr32_6x,.-_aesni_ctr32_6x
# aesni_gcm_encrypt: encrypt path. Because GHASH consumes ciphertext, the
# first six blocks are produced by _aesni_ctr32_6x before the stitched
# loop can run, and a final Karatsuba GHASH pass folds in the trailing
# blocks after the loop. NOTE(review): excerpt is sampled; several labels
# and branches are missing between the embedded original line numbers.
590 .globl aesni_gcm_encrypt
591 .type aesni_gcm_encrypt,\@function,6
# Needs at least 3*6 blocks so the pipeline fill/drain is covered.
595 cmp \$0x60*3,$len # minimal accepted length
598 lea (%rsp),%rax # save stack pointer
# Win64: preserve callee-saved xmm6-xmm15.
606 $code.=<<___ if ($win64);
608 movaps %xmm6,-0xd8(%rax)
609 movaps %xmm7,-0xc8(%rax)
610 movaps %xmm8,-0xb8(%rax)
611 movaps %xmm9,-0xa8(%rax)
612 movaps %xmm10,-0x98(%rax)
613 movaps %xmm11,-0x88(%rax)
614 movaps %xmm12,-0x78(%rax)
615 movaps %xmm13,-0x68(%rax)
616 movaps %xmm14,-0x58(%rax)
617 movaps %xmm15,-0x48(%rax)
623 vmovdqu ($ivp),$T1 # input counter value
625 mov 12($ivp),$counter
626 lea .Lbswap_mask(%rip),$const
627 lea 0x80($key),$key # size optimization
628 vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
629 and \$-64,%rsp # ensure stack alignment
630 mov 0xf0-0x80($key),$rounds
633 lea -0xc0($out,$len),$end0
# Stage byte-swapped copies of the just-produced ciphertext on the stack
# as GHASH input for the stitched loop.
637 vpshufb $Ii,$inout0,$Xi # save bswapped output on stack
638 vpshufb $Ii,$inout1,$T2
639 vmovdqu $Xi,0x70(%rsp)
640 vpshufb $Ii,$inout2,$Z0
641 vmovdqu $T2,0x60(%rsp)
642 vpshufb $Ii,$inout3,$Z1
643 vmovdqu $Z0,0x50(%rsp)
644 vpshufb $Ii,$inout4,$Z2
645 vmovdqu $Z1,0x40(%rsp)
646 vpshufb $Ii,$inout5,$Z3 # passed to _aesni_ctr32_ghash_6x
647 vmovdqu $Z2,0x30(%rsp)
651 vmovdqu ($Xip),$Xi # load Xi
652 lea 0x20+0x20($Xip),$Xip # size optimization
657 call _aesni_ctr32_ghash_6x
# Drain: write the last six ciphertext blocks and keep byte-swapped
# copies in registers for the final GHASH pass below.
658 vmovdqu 0x20(%rsp),$Z3 # I[5]
659 vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
660 vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
661 vpunpckhqdq $Z3,$Z3,$T1
662 vmovdqu 0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK
663 vmovups $inout0,-0x60($out) # save output
664 vpshufb $Ii,$inout0,$inout0 # but keep bswapped copy
666 vmovups $inout1,-0x50($out)
667 vpshufb $Ii,$inout1,$inout1
668 vmovups $inout2,-0x40($out)
669 vpshufb $Ii,$inout2,$inout2
670 vmovups $inout3,-0x30($out)
671 vpshufb $Ii,$inout3,$inout3
672 vmovups $inout4,-0x20($out)
673 vpshufb $Ii,$inout4,$inout4
674 vmovups $inout5,-0x10($out)
675 vpshufb $Ii,$inout5,$inout5
676 vmovdqu $inout0,0x10(%rsp) # free $inout0
678 { my ($HK,$T3)=($rndkey,$inout0);
# Final GHASH of the remaining 12 blocks: Karatsuba form — per block one
# low (0x00), one high (0x11) and one middle product from the folded
# halves (vpunpckhqdq + xor), aggregated afterwards.
681 vmovdqu 0x30(%rsp),$Z2 # I[4]
682 vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2
683 vpunpckhqdq $Z2,$Z2,$T2
684 vpclmulqdq \$0x00,$Hkey,$Z3,$Z1
686 vpclmulqdq \$0x11,$Hkey,$Z3,$Z3
687 vpclmulqdq \$0x00,$HK,$T1,$T1
689 vmovdqu 0x40(%rsp),$T3 # I[3]
690 vpclmulqdq \$0x00,$Ii,$Z2,$Z0
691 vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3
693 vpunpckhqdq $T3,$T3,$Z1
694 vpclmulqdq \$0x11,$Ii,$Z2,$Z2
697 vpclmulqdq \$0x10,$HK,$T2,$T2
698 vmovdqu 0x50-0x20($Xip),$HK
701 vmovdqu 0x50(%rsp),$T1 # I[2]
702 vpclmulqdq \$0x00,$Hkey,$T3,$Z3
703 vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4
705 vpunpckhqdq $T1,$T1,$Z0
706 vpclmulqdq \$0x11,$Hkey,$T3,$T3
709 vpclmulqdq \$0x00,$HK,$Z1,$Z1
712 vmovdqu 0x60(%rsp),$T2 # I[1]
713 vpclmulqdq \$0x00,$Ii,$T1,$Z2
714 vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5
716 vpunpckhqdq $T2,$T2,$Z3
717 vpclmulqdq \$0x11,$Ii,$T1,$T1
720 vpclmulqdq \$0x10,$HK,$Z0,$Z0
721 vmovdqu 0x80-0x20($Xip),$HK
724 vpxor 0x70(%rsp),$Xi,$Xi # accumulate I[0]
725 vpclmulqdq \$0x00,$Hkey,$T2,$Z1
726 vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6
727 vpunpckhqdq $Xi,$Xi,$T3
729 vpclmulqdq \$0x11,$Hkey,$T2,$T2
732 vpclmulqdq \$0x00,$HK,$Z3,$Z3
735 vpclmulqdq \$0x00,$Ii,$Xi,$Z2
736 vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
737 vpunpckhqdq $inout5,$inout5,$T1
738 vpclmulqdq \$0x11,$Ii,$Xi,$Xi
739 vpxor $inout5,$T1,$T1
741 vpclmulqdq \$0x10,$HK,$T3,$T3
742 vmovdqu 0x20-0x20($Xip),$HK
# Second group of six: the bswapped copies still held in inout5..inout1.
746 vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2
747 vpxor $Z1,$Z3,$T3 # aggregated Karatsuba post-processing
748 vpclmulqdq \$0x00,$Hkey,$inout5,$Z0
750 vpunpckhqdq $inout4,$inout4,$T2
751 vpclmulqdq \$0x11,$Hkey,$inout5,$inout5
752 vpxor $inout4,$T2,$T2
754 vpclmulqdq \$0x00,$HK,$T1,$T1
759 vpclmulqdq \$0x00,$Ii,$inout4,$Z1
760 vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3
762 vpunpckhqdq $inout3,$inout3,$T3
763 vpclmulqdq \$0x11,$Ii,$inout4,$inout4
764 vpxor $inout3,$T3,$T3
765 vpxor $inout5,$inout4,$inout4
766 vpalignr \$8,$Xi,$Xi,$inout5 # 1st phase
767 vpclmulqdq \$0x10,$HK,$T2,$T2
768 vmovdqu 0x50-0x20($Xip),$HK
771 vpclmulqdq \$0x00,$Hkey,$inout3,$Z0
772 vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4
774 vpunpckhqdq $inout2,$inout2,$T1
775 vpclmulqdq \$0x11,$Hkey,$inout3,$inout3
776 vpxor $inout2,$T1,$T1
777 vpxor $inout4,$inout3,$inout3
778 vxorps 0x10(%rsp),$Z3,$Z3 # accumulate $inout0
779 vpclmulqdq \$0x00,$HK,$T3,$T3
# Reduction interleaved with the multiplies: 0x10($const) is .Lpoly.
782 vpclmulqdq \$0x10,0x10($const),$Xi,$Xi
783 vxorps $inout5,$Xi,$Xi
785 vpclmulqdq \$0x00,$Ii,$inout2,$Z1
786 vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5
788 vpunpckhqdq $inout1,$inout1,$T2
789 vpclmulqdq \$0x11,$Ii,$inout2,$inout2
790 vpxor $inout1,$T2,$T2
791 vpalignr \$8,$Xi,$Xi,$inout5 # 2nd phase
792 vpxor $inout3,$inout2,$inout2
793 vpclmulqdq \$0x10,$HK,$T1,$T1
794 vmovdqu 0x80-0x20($Xip),$HK
797 vxorps $Z3,$inout5,$inout5
798 vpclmulqdq \$0x10,0x10($const),$Xi,$Xi
799 vxorps $inout5,$Xi,$Xi
801 vpclmulqdq \$0x00,$Hkey,$inout1,$Z0
802 vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6
804 vpunpckhqdq $Xi,$Xi,$T3
805 vpclmulqdq \$0x11,$Hkey,$inout1,$inout1
807 vpxor $inout2,$inout1,$inout1
808 vpclmulqdq \$0x00,$HK,$T2,$T2
811 vpclmulqdq \$0x00,$Ii,$Xi,$Z1
812 vpclmulqdq \$0x11,$Ii,$Xi,$Z3
814 vpclmulqdq \$0x10,$HK,$T3,$Z2
815 vpxor $inout1,$Z3,$Z3
818 vpxor $Z1,$Z3,$Z0 # aggregated Karatsuba post-processing
# Final two-phase reduction of the 256-bit product modulo the GCM poly.
821 vmovdqu 0x10($const),$Hkey # .Lpoly
826 vpalignr \$8,$Xi,$Xi,$T2 # 1st phase
827 vpclmulqdq \$0x10,$Hkey,$Xi,$Xi
830 vpalignr \$8,$Xi,$Xi,$T2 # 2nd phase
831 vpclmulqdq \$0x10,$Hkey,$Xi,$Xi
837 vpshufb ($const),$Xi,$Xi # .Lbswap_mask
838 vmovdqu $Xi,-0x40($Xip) # output Xi
# Win64 epilogue: restore callee-saved xmm registers.
842 $code.=<<___ if ($win64);
843 movaps -0xd8(%rax),%xmm6
844 movaps -0xc8(%rax),%xmm7
845 movaps -0xb8(%rax),%xmm8
846 movaps -0xa8(%rax),%xmm9
847 movaps -0x98(%rax),%xmm10
848 movaps -0x88(%rax),%xmm11
849 movaps -0x78(%rax),%xmm12
850 movaps -0x68(%rax),%xmm13
851 movaps -0x58(%rax),%xmm14
852 movaps -0x48(%rax),%xmm15
861 lea (%rax),%rsp # restore %rsp
863 mov $ret,%rax # return value
865 .size aesni_gcm_encrypt,.-aesni_gcm_encrypt
# Constant tables. Label lines are not visible in this excerpt; by the
# offsets used above (0x00/0x10/0x20/0x30/0x40 from the const base) these
# appear to be, in order: .Lbswap_mask, .Lpoly, .Lone_msb, .Ltwo_lsb,
# .Lone_lsb — TODO confirm against the full source.
871 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
873 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
875 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
877 .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
879 .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
880 .asciz "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
# Win64 structured-exception handler: when an exception hits between the
# recorded prologue/epilogue labels, recover the saved stack pointer and
# non-volatile registers so the unwinder sees a consistent frame.
890 .extern __imp_RtlVirtualUnwind
891 .type gcm_se_handler,\@abi-omnipotent
905 mov 120($context),%rax # pull context->Rax
906 mov 248($context),%rbx # pull context->Rip
908 mov 8($disp),%rsi # disp->ImageBase
909 mov 56($disp),%r11 # disp->HandlerData
911 mov 0(%r11),%r10d # HandlerData[0]
912 lea (%rsi,%r10),%r10 # prologue label
913 cmp %r10,%rbx # context->Rip<prologue label
916 mov 152($context),%rax # pull context->Rsp
918 mov 4(%r11),%r10d # HandlerData[1]
919 lea (%rsi,%r10),%r10 # epilogue label
920 cmp %r10,%rbx # context->Rip>=epilogue label
921 jae .Lcommon_seh_tail
923 mov 120($context),%rax # pull context->Rax
# Faulted inside the body: write the saved GP registers back into CONTEXT.
931 mov %r15,240($context)
932 mov %r14,232($context)
933 mov %r13,224($context)
934 mov %r12,216($context)
935 mov %rbp,160($context)
936 mov %rbx,144($context)
# Copy the ten saved xmm registers (0xd8 below the saved rax, matching
# the prologue layout) into CONTEXT.Xmm6..Xmm15.
938 lea -0xd8(%rax),%rsi # %xmm save area
939 lea 512($context),%rdi # & context.Xmm6
940 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
941 .long 0xa548f3fc # cld; rep movsq
946 mov %rax,152($context) # restore context->Rsp
947 mov %rsi,168($context) # restore context->Rsi
948 mov %rdi,176($context) # restore context->Rdi
# Chain to RtlVirtualUnwind with the patched CONTEXT.
950 mov 40($disp),%rdi # disp->ContextRecord
951 mov $context,%rsi # context
952 mov \$154,%ecx # sizeof(CONTEXT)
953 .long 0xa548f3fc # cld; rep movsq
956 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
957 mov 8(%rsi),%rdx # arg2, disp->ImageBase
958 mov 0(%rsi),%r8 # arg3, disp->ControlPc
959 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
960 mov 40(%rsi),%r10 # disp->ContextRecord
961 lea 56(%rsi),%r11 # &disp->HandlerData
962 lea 24(%rsi),%r12 # &disp->EstablisherFrame
963 mov %r10,32(%rsp) # arg5
964 mov %r11,40(%rsp) # arg6
965 mov %r12,48(%rsp) # arg7
966 mov %rcx,56(%rsp) # arg8, (NULL)
967 call *__imp_RtlVirtualUnwind(%rip)
969 mov \$1,%eax # ExceptionContinueSearch
981 .size gcm_se_handler,.-gcm_se_handler
# SEH directory: map each function's begin/end range to its unwind info.
985 .rva .LSEH_begin_aesni_gcm_decrypt
986 .rva .LSEH_end_aesni_gcm_decrypt
987 .rva .LSEH_gcm_dec_info
989 .rva .LSEH_begin_aesni_gcm_encrypt
990 .rva .LSEH_end_aesni_gcm_encrypt
991 .rva .LSEH_gcm_enc_info
# HandlerData: prologue/epilogue labels consumed by gcm_se_handler above.
997 .rva .Lgcm_dec_body,.Lgcm_dec_abort
1001 .rva .Lgcm_enc_body,.Lgcm_enc_abort
# Fallback when the assembler cannot handle the AVX code above: replace
# the entire generated module with stub entry points (bodies not visible
# in this excerpt; presumably they just signal "not supported").
1005 $code=<<___; # assembler is too old
1008 .globl aesni_gcm_encrypt
1009 .type aesni_gcm_encrypt,\@abi-omnipotent
1013 .size aesni_gcm_encrypt,.-aesni_gcm_encrypt
1015 .globl aesni_gcm_decrypt
1016 .type aesni_gcm_decrypt,\@abi-omnipotent
1020 .size aesni_gcm_decrypt,.-aesni_gcm_decrypt
# Expand `...` escapes in the generated text by evaluating them as Perl.
1024 $code =~ s/\`([^\`]*)\`/eval($1)/gem;