2 # Copyright 2011-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
# Assembler helpers for Padlock engine. See also e_padlock-x86.pl for
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Emit Win64 (Microsoft) calling-convention code when the requested assembler
# flavour is nasm/masm/mingw64 or the output file name ends in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the perlasm translator: first next to this script, then in the
# crypto/perlasm directory of an OpenSSL source tree layout.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
# Pipe everything printed to OUT through the translator ($^X is the running
# perl interpreter), which converts perlasm into the target assembler dialect.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
or die "can't call $xlate: $!";
# Bytes of input the Padlock unit prefetches per xcrypt iteration for each
# mode; used below to avoid prefetching past a page boundary (errata).
# Modes absent from the hash need no workaround.
%PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32); # prefetch errata
$PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20
# First four integer argument registers under the active calling convention.
($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
("%rdi","%rsi","%rdx","%rcx"); # Unix order
53 .globl padlock_capability
54 .type padlock_capability,\@abi-omnipotent
61 cmp \$`"0x".unpack("H*",'tneC')`,%ebx
63 cmp \$`"0x".unpack("H*",'Hrua')`,%edx
65 cmp \$`"0x".unpack("H*",'slua')`,%ecx
69 cmp \$`"0x".unpack("H*",'hS ')`,%ebx
71 cmp \$`"0x".unpack("H*",'hgna')`,%edx
73 cmp \$`"0x".unpack("H*",' ia')`,%ecx
86 or \$0x10,%eax # set Nano bit#4
90 .size padlock_capability,.-padlock_capability
92 .globl padlock_key_bswap
93 .type padlock_key_bswap,\@abi-omnipotent,0
105 .size padlock_key_bswap,.-padlock_key_bswap
107 .globl padlock_verify_context
108 .type padlock_verify_context,\@abi-omnipotent
110 padlock_verify_context:
113 lea .Lpadlock_saved_context(%rip),%rax
114 call _padlock_verify_ctx
117 .size padlock_verify_context,.-padlock_verify_context
119 .type _padlock_verify_ctx,\@abi-omnipotent
132 .size _padlock_verify_ctx,.-_padlock_verify_ctx
134 .globl padlock_reload_key
135 .type padlock_reload_key,\@abi-omnipotent
141 .size padlock_reload_key,.-padlock_reload_key
143 .globl padlock_aes_block
144 .type padlock_aes_block,\@function,3
149 lea 32($ctx),%rbx # key
150 lea 16($ctx),$ctx # control word
151 .byte 0xf3,0x0f,0xa7,0xc8 # rep xcryptecb
154 .size padlock_aes_block,.-padlock_aes_block
156 .globl padlock_xstore
157 .type padlock_xstore,\@function,2
161 .byte 0x0f,0xa7,0xc0 # xstore
163 .size padlock_xstore,.-padlock_xstore
165 .globl padlock_sha1_oneshot
166 .type padlock_sha1_oneshot,\@function,3
168 padlock_sha1_oneshot:
170 mov %rdi,%rdx # put aside %rdi
171 movups (%rdi),%xmm0 # copy-in context
178 .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1
182 movups %xmm0,(%rdx) # copy-out context
185 .size padlock_sha1_oneshot,.-padlock_sha1_oneshot
187 .globl padlock_sha1_blocks
188 .type padlock_sha1_blocks,\@function,3
192 mov %rdi,%rdx # put aside %rdi
193 movups (%rdi),%xmm0 # copy-in context
200 .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1
204 movups %xmm0,(%rdx) # copy-out context
207 .size padlock_sha1_blocks,.-padlock_sha1_blocks
209 .globl padlock_sha256_oneshot
210 .type padlock_sha256_oneshot,\@function,3
212 padlock_sha256_oneshot:
214 mov %rdi,%rdx # put aside %rdi
215 movups (%rdi),%xmm0 # copy-in context
217 movups 16(%rdi),%xmm1
220 movaps %xmm1,16(%rsp)
222 .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256
224 movaps 16(%rsp),%xmm1
226 movups %xmm0,(%rdx) # copy-out context
227 movups %xmm1,16(%rdx)
229 .size padlock_sha256_oneshot,.-padlock_sha256_oneshot
231 .globl padlock_sha256_blocks
232 .type padlock_sha256_blocks,\@function,3
234 padlock_sha256_blocks:
236 mov %rdi,%rdx # put aside %rdi
237 movups (%rdi),%xmm0 # copy-in context
239 movups 16(%rdi),%xmm1
242 movaps %xmm1,16(%rsp)
244 .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256
246 movaps 16(%rsp),%xmm1
248 movups %xmm0,(%rdx) # copy-out context
249 movups %xmm1,16(%rdx)
251 .size padlock_sha256_blocks,.-padlock_sha256_blocks
253 .globl padlock_sha512_blocks
254 .type padlock_sha512_blocks,\@function,3
256 padlock_sha512_blocks:
258 mov %rdi,%rdx # put aside %rdi
259 movups (%rdi),%xmm0 # copy-in context
261 movups 16(%rdi),%xmm1
262 movups 32(%rdi),%xmm2
263 movups 48(%rdi),%xmm3
266 movaps %xmm1,16(%rsp)
267 movaps %xmm2,32(%rsp)
268 movaps %xmm3,48(%rsp)
.byte	0xf3,0x0f,0xa6,0xe0		# rep xsha512
271 movaps 16(%rsp),%xmm1
272 movaps 32(%rsp),%xmm2
273 movaps 48(%rsp),%xmm3
275 movups %xmm0,(%rdx) # copy-out context
276 movups %xmm1,16(%rdx)
277 movups %xmm2,32(%rdx)
278 movups %xmm3,48(%rdx)
280 .size padlock_sha512_blocks,.-padlock_sha512_blocks
284 my ($mode,$opcode) = @_;
285 # int padlock_$mode_encrypt(void *out, const void *inp,
286 # struct padlock_cipher_data *ctx, size_t len);
288 .globl padlock_${mode}_encrypt
289 .type padlock_${mode}_encrypt,\@function,4
291 padlock_${mode}_encrypt:
300 lea .Lpadlock_saved_context(%rip),%rax
303 call _padlock_verify_ctx
304 lea 16($ctx),$ctx # control word
307 testl \$`1<<5`,($ctx) # align bit in control word
308 jnz .L${mode}_aligned
310 setz %al # !out_misaligned
312 setz %bl # !inp_misaligned
314 jnz .L${mode}_aligned
316 mov \$$PADLOCK_CHUNK,$chunk
317 not %rax # out_misaligned?-1:0
320 cmovc $len,$chunk # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
321 and $chunk,%rax # out_misaligned?chunk:0
324 and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK
326 mov \$$PADLOCK_CHUNK,%rax
327 cmovz %rax,$chunk # chunk=chunk?:PADLOCK_CHUNK
329 $code.=<<___ if ($mode eq "ctr32");
331 mov -4($ctx),%eax # pull 32-bit counter
334 and \$`$PADLOCK_CHUNK/16-1`,%eax
335 mov \$$PADLOCK_CHUNK,$chunk
339 cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK
342 $code.=<<___ if ($PADLOCK_PREFETCH{$mode});
345 mov $inp,%rax # check if prefetch crosses page
350 and \$0xfff,%rax # distance to page boundary
351 cmp \$$PADLOCK_PREFETCH{$mode},%rax
352 mov \$-$PADLOCK_PREFETCH{$mode},%rax
353 cmovae $chunk,%rax # mask=distance<prefetch?-prefetch:-1
355 jz .L${mode}_unaligned_tail
361 cmp $len,$chunk # ctr32 artefact
362 cmova $len,$chunk # ctr32 artefact
363 mov $out,%r8 # save parameters
368 test \$0x0f,$out # out_misaligned
370 test \$0x0f,$inp # inp_misaligned
371 jz .L${mode}_inp_aligned
373 .byte 0xf3,0x48,0xa5 # rep movsq
377 .L${mode}_inp_aligned:
378 lea -16($ctx),%rax # ivp
379 lea 16($ctx),%rbx # key
381 .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
383 $code.=<<___ if ($mode !~ /ecb|ctr/);
385 movdqa %xmm0,-16($ctx) # copy [or refresh] iv
387 $code.=<<___ if ($mode eq "ctr32");
388 mov -4($ctx),%eax # pull 32-bit counter
389 test \$0xffff0000,%eax
390 jnz .L${mode}_no_carry
398 mov %r8,$out # restore parameters
401 jz .L${mode}_out_aligned
405 .byte 0xf3,0x48,0xa5 # rep movsq
407 .L${mode}_out_aligned:
413 mov \$$PADLOCK_CHUNK,$chunk
415 if (!$PADLOCK_PREFETCH{$mode}) {
425 $code.=<<___ if ($mode eq "ctr32");
427 mov $inp,%rax # check if prefetch crosses page
432 and \$0xfff,%rax # distance to page boundary
433 cmp \$$PADLOCK_PREFETCH{$mode},%rax
434 mov \$-$PADLOCK_PREFETCH{$mode},%rax
440 .L${mode}_unaligned_tail:
444 mov $out,%r8 # save parameters
446 sub %rax,%rsp # alloca
449 .byte 0xf3,0x48,0xa5 # rep movsq
451 mov %r8, $out # restore parameters
477 $code.=<<___ if ($mode eq "ctr32");
478 mov -4($ctx),%eax # pull 32-bit counter
482 mov \$`16*0x10000`,$chunk
486 cmova %rax,$chunk # don't let counter cross 2^16
488 jbe .L${mode}_aligned_skip
490 .L${mode}_aligned_loop:
491 mov $len,%r10 # save parameters
495 lea -16($ctx),%rax # ivp
496 lea 16($ctx),%rbx # key
497 shr \$4,$len # len/=AES_BLOCK_SIZE
498 .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
500 mov -4($ctx),%eax # pull 32-bit counter
506 mov %r10,$len # restore parameters
508 mov \$`16*0x10000`,$chunk
511 jae .L${mode}_aligned_loop
513 .L${mode}_aligned_skip:
515 $code.=<<___ if ($PADLOCK_PREFETCH{$mode});
518 and \$0xfff,%rbp # distance to page boundary
520 cmp \$$PADLOCK_PREFETCH{$mode},%rbp
521 mov \$$PADLOCK_PREFETCH{$mode}-1,%rbp
523 and $len,%rbp # remainder
525 jz .L${mode}_aligned_tail
528 lea -16($ctx),%rax # ivp
529 lea 16($ctx),%rbx # key
530 shr \$4,$len # len/=AES_BLOCK_SIZE
531 .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
533 $code.=<<___ if ($mode !~ /ecb|ctr/);
535 movdqa %xmm0,-16($ctx) # copy [or refresh] iv
537 $code.=<<___ if ($PADLOCK_PREFETCH{$mode});
538 test %rbp,%rbp # check remainder
541 .L${mode}_aligned_tail:
549 .byte 0xf3,0x48,0xa5 # rep movsq
563 .size padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
# Instantiate the parameterized padlock_${mode}_encrypt routine once per
# supported cipher mode.  The second argument is the xcrypt opcode byte
# emitted after the 0xf3,0x0f,0xa7 "rep xcrypt" prefix for that mode.
# NOTE: dropped the legacy leading "&" call syntax — with parentheses and no
# prototype it is equivalent, and plain calls are the modern idiom.
generate_mode("ecb",0xc8);
generate_mode("cbc",0xd0);
generate_mode("cfb",0xe0);
generate_mode("ofb",0xe8);
generate_mode("ctr32",0xd8);	# all 64-bit CPUs have working CTR...
574 .asciz "VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
578 .Lpadlock_saved_context:
581 $code =~ s/\`([^\`]*)\`/eval($1)/gem;