3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
12 # Assembler helpers for Padlock engine. See also e_padlock-x86.pl for
# NOTE(review): this file is a fragmentary excerpt of a perlasm module
# (x86_64 assembler generator for the VIA Padlock engine).  Lines are
# non-contiguous and each carries its original line number fused onto the
# front -- the leading integers are extraction artifacts, not code.  The
# comments below annotate only what the visible fragments show.
#
# Flavour/output handling: perlasm scripts receive an output "flavour"
# (elf, macosx, mingw64, nasm, masm, ...); a flavour containing a dot is
# actually the output file name.
17 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 calling convention is selected for nasm/masm/mingw64 flavours or
# for a .asm output file name.
19 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the x86_64-xlate.pl translator next to this script or under
# crypto/perlasm, then pipe all generated code through it.
21 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
22 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
23 ( $xlate="${dir}../../crypto/perlasm/x86_64-xlate.pl" and -f $xlate) or
24 die "can't locate x86_64-xlate.pl";
26 open OUT,"| \"$^X\" $xlate $flavour $output";
# Per-mode prefetch distances (bytes) working around Padlock prefetch
# errata, and the chunk size used when bouncing misaligned buffers.
31 %PADLOCK_PREFETCH=(ecb=>128, cbc=>64, ctr32=>32); # prefetch errata
32 $PADLOCK_CHUNK=512; # Must be a power of 2 between 32 and 2^20
# Abstract the first four integer argument registers over the two ABIs:
# Win64 passes them in rcx/rdx/r8/r9, System V AMD64 in rdi/rsi/rdx/rcx.
40 ($arg1,$arg2,$arg3,$arg4)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
41 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
# padlock_capability() -- CPUID probe for Centaur/VIA hardware (fragment).
# 'tneC'/'Hrua'/'slua' are the byte-reversed quarters of the CPUID vendor
# string "CentaurHauls" as returned in ebx/edx/ecx; the unpack("H*",...)
# trick turns each 4-char string into its little-endian immediate.
44 .globl padlock_capability
45 .type padlock_capability,\@abi-omnipotent
52 cmp \$`"0x".unpack("H*",'tneC')`,%ebx
54 cmp \$`"0x".unpack("H*",'Hrua')`,%edx
56 cmp \$`"0x".unpack("H*",'slua')`,%ecx
# NOTE(review): the intervening instructions are elided from this excerpt.
68 or \$0x10,%eax # set Nano bit#4
72 .size padlock_capability,.-padlock_capability
# padlock_key_bswap -- presumably byte-swaps an AES key schedule in place
# (inferred from the name only; the entire body is elided from this
# excerpt -- TODO confirm against the full source).
74 .globl padlock_key_bswap
75 .type padlock_key_bswap,\@abi-omnipotent,0
87 .size padlock_key_bswap,.-padlock_key_bswap
# padlock_verify_context() -- public wrapper: loads the address of the
# .Lpadlock_saved_context slot (defined near the end of the file) and
# calls the shared _padlock_verify_ctx helper.
89 .globl padlock_verify_context
90 .type padlock_verify_context,\@abi-omnipotent
92 padlock_verify_context:
95 lea .Lpadlock_saved_context(%rip),%rax
96 call _padlock_verify_ctx
99 .size padlock_verify_context,.-padlock_verify_context
# _padlock_verify_ctx -- internal helper; its body is elided from this
# excerpt.
101 .type _padlock_verify_ctx,\@abi-omnipotent
114 .size _padlock_verify_ctx,.-_padlock_verify_ctx
# padlock_reload_key -- body elided from this excerpt; only the symbol
# directives are visible.
116 .globl padlock_reload_key
117 .type padlock_reload_key,\@abi-omnipotent
123 .size padlock_reload_key,.-padlock_reload_key
# padlock_aes_block(out, inp, ctx) -- single-block AES via the Padlock
# "rep xcryptecb" instruction (F3 0F A7 C8).  Per the visible operands,
# ctx+16 is the control word and ctx+32 the key schedule; xcrypt takes
# the key pointer in %rbx.
125 .globl padlock_aes_block
126 .type padlock_aes_block,\@function,3
131 lea 32($ctx),%rbx # key
132 lea 16($ctx),$ctx # control word
133 .byte 0xf3,0x0f,0xa7,0xc8 # rep xcryptecb
136 .size padlock_aes_block,.-padlock_aes_block
# padlock_xstore -- executes the Padlock "xstore" instruction (0F A7 C0),
# the hardware RNG output store.  Most of the body (argument shuffling,
# return) is elided from this excerpt.
138 .globl padlock_xstore
139 .type padlock_xstore,\@function,2
143 .byte 0x0f,0xa7,0xc0 # xstore
145 .size padlock_xstore,.-padlock_xstore
# padlock_sha1_oneshot(ctx, inp, len) -- complete SHA-1 over the input via
# "rep xsha1" (F3 0F A6 C8).  Visible flow: stash the context pointer in
# %rdx (xsha1 itself uses %rdi/%rsi/%rcx), load the 16-byte state into
# %xmm0, run the instruction, store the state back.  Stack setup and
# register shuffling between these steps are elided.
147 .globl padlock_sha1_oneshot
148 .type padlock_sha1_oneshot,\@function,3
150 padlock_sha1_oneshot:
152 mov %rdi,%rdx # put aside %rdi
153 movups (%rdi),%xmm0 # copy-in context
160 .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1
164 movups %xmm0,(%rdx) # copy-out context
167 .size padlock_sha1_oneshot,.-padlock_sha1_oneshot
# padlock_sha1_blocks(ctx, inp, len) -- multi-block variant of the SHA-1
# routine above; identical visible structure (context in/out around a
# "rep xsha1"), with the differing setup elided from this excerpt.
169 .globl padlock_sha1_blocks
170 .type padlock_sha1_blocks,\@function,3
174 mov %rdi,%rdx # put aside %rdi
175 movups (%rdi),%xmm0 # copy-in context
182 .byte 0xf3,0x0f,0xa6,0xc8 # rep xsha1
186 movups %xmm0,(%rdx) # copy-out context
189 .size padlock_sha1_blocks,.-padlock_sha1_blocks
# padlock_sha256_oneshot(ctx, inp, len) -- SHA-256 via "rep xsha256"
# (F3 0F A6 D0).  The 32-byte state is carried in %xmm0/%xmm1; it is
# spilled to the stack around the instruction and copied back out to the
# saved context pointer.  The %xmm0 spill/restore and stack setup lines
# are elided from this excerpt.
191 .globl padlock_sha256_oneshot
192 .type padlock_sha256_oneshot,\@function,3
194 padlock_sha256_oneshot:
196 mov %rdi,%rdx # put aside %rdi
197 movups (%rdi),%xmm0 # copy-in context
199 movups 16(%rdi),%xmm1
202 movaps %xmm1,16(%rsp)
204 .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256
206 movaps 16(%rsp),%xmm1
208 movups %xmm0,(%rdx) # copy-out context
209 movups %xmm1,16(%rdx)
211 .size padlock_sha256_oneshot,.-padlock_sha256_oneshot
# padlock_sha256_blocks(ctx, inp, len) -- multi-block variant of the
# SHA-256 routine above; same visible structure (32-byte state in
# %xmm0/%xmm1 around a "rep xsha256"), differing setup elided.
213 .globl padlock_sha256_blocks
214 .type padlock_sha256_blocks,\@function,3
216 padlock_sha256_blocks:
218 mov %rdi,%rdx # put aside %rdi
219 movups (%rdi),%xmm0 # copy-in context
221 movups 16(%rdi),%xmm1
224 movaps %xmm1,16(%rsp)
226 .byte 0xf3,0x0f,0xa6,0xd0 # rep xsha256
228 movaps 16(%rsp),%xmm1
230 movups %xmm0,(%rdx) # copy-out context
231 movups %xmm1,16(%rdx)
233 .size padlock_sha256_blocks,.-padlock_sha256_blocks
# padlock_sha512_blocks(ctx, inp, len) -- SHA-512 via F3 0F A6 E0.  The
# 64-byte state travels in %xmm0-%xmm3, spilled to the stack around the
# instruction and copied back to the saved context pointer.
# NOTE(review): the emitted "# rep xha512" comment below is missing an
# 's' -- the opcode pattern (C8=xsha1, D0=xsha256, E0=xsha512) shows it
# is "rep xsha512".  Left untouched because heredoc text is part of the
# generated output.
235 .globl padlock_sha512_blocks
236 .type padlock_sha512_blocks,\@function,3
238 padlock_sha512_blocks:
240 mov %rdi,%rdx # put aside %rdi
241 movups (%rdi),%xmm0 # copy-in context
243 movups 16(%rdi),%xmm1
244 movups 32(%rdi),%xmm2
245 movups 48(%rdi),%xmm3
248 movaps %xmm1,16(%rsp)
249 movaps %xmm2,32(%rsp)
250 movaps %xmm3,48(%rsp)
251 .byte 0xf3,0x0f,0xa6,0xe0 # rep xha512
253 movaps 16(%rsp),%xmm1
254 movaps 32(%rsp),%xmm2
255 movaps 48(%rsp),%xmm3
257 movups %xmm0,(%rdx) # copy-out context
258 movups %xmm1,16(%rdx)
259 movups %xmm2,32(%rdx)
260 movups %xmm3,48(%rdx)
262 .size padlock_sha512_blocks,.-padlock_sha512_blocks
# Body of sub generate_mode($mode, $opcode) -- emits one bulk cipher
# entry point padlock_${mode}_encrypt() per mode (the callers at the end
# of the file name the sub; its "sub generate_mode {" line and many
# interior lines, including heredoc delimiters, are elided from this
# excerpt).  Comments below mark only the visible phases.
266 my ($mode,$opcode) = @_;
267 # int padlock_$mode_encrypt(void *out, const void *inp,
268 # struct padlock_cipher_data *ctx, size_t len);
270 .globl padlock_${mode}_encrypt
271 .type padlock_${mode}_encrypt,\@function,4
273 padlock_${mode}_encrypt:
# Phase 1: verify the cipher context and point $ctx at the control word.
282 lea .Lpadlock_saved_context(%rip),%rax
285 call _padlock_verify_ctx
286 lea 16($ctx),$ctx # control word
# Phase 2: fast path -- if the control word's align bit is set, or both
# buffers are already 16-byte aligned, jump straight to the aligned loop.
289 testl \$`1<<5`,($ctx) # align bit in control word
290 jnz .L${mode}_aligned
292 setz %al # !out_misaligned
294 setz %bl # !inp_misaligned
296 jnz .L${mode}_aligned
# Phase 3: misaligned path -- process in PADLOCK_CHUNK-sized pieces,
# bouncing through an aligned stack buffer when needed.
298 mov \$$PADLOCK_CHUNK,$chunk
299 not %rax # out_misaligned?-1:0
302 cmovc $len,$chunk # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
303 and $chunk,%rax # out_misaligned?chunk:0
306 and \$$PADLOCK_CHUNK-1,$chunk # chunk%=PADLOCK_CHUNK
308 mov \$$PADLOCK_CHUNK,%rax
309 cmovz %rax,$chunk # chunk=chunk?:PADLOCK_CHUNK
# ctr32 only: clamp the chunk so the 32-bit counter cannot wrap inside
# one xcrypt invocation.
311 $code.=<<___ if ($mode eq "ctr32");
313 mov -4($ctx),%eax # pull 32-bit counter
316 and \$`$PADLOCK_CHUNK/16-1`,%eax
317 mov \$$PADLOCK_CHUNK,$chunk
321 cmova %rax,$chunk # don't let counter cross PADLOCK_CHUNK
# Prefetch-errata modes only: if the hardware prefetch would cross a page
# boundary, carve off an unaligned tail handled separately.
324 $code.=<<___ if ($PADLOCK_PREFETCH{$mode});
327 mov $inp,%rax # check if prefetch crosses page
332 and \$0xfff,%rax # distance to page boundary
333 cmp \$$PADLOCK_PREFETCH{$mode},%rax
334 mov \$-$PADLOCK_PREFETCH{$mode},%rax
335 cmovae $chunk,%rax # mask=distance<prefetch?-prefetch:-1
337 jz .L${mode}_unaligned_tail
# Misaligned main loop: optionally copy input into the aligned bounce
# buffer (rep movsq), run rep xcrypt*, then for IV-carrying modes refresh
# the IV and for ctr32 propagate counter carries.
343 cmp $len,$chunk # ctr32 artefact
344 cmova $len,$chunk # ctr32 artefact
345 mov $out,%r8 # save parameters
350 test \$0x0f,$out # out_misaligned
352 test \$0x0f,$inp # inp_misaligned
353 jz .L${mode}_inp_aligned
355 .byte 0xf3,0x48,0xa5 # rep movsq
359 .L${mode}_inp_aligned:
360 lea -16($ctx),%rax # ivp
361 lea 16($ctx),%rbx # key
363 .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
365 $code.=<<___ if ($mode !~ /ecb|ctr/);
367 movdqa %xmm0,-16($ctx) # copy [or refresh] iv
369 $code.=<<___ if ($mode eq "ctr32");
370 mov -4($ctx),%eax # pull 32-bit counter
371 test \$0xffff0000,%eax
372 jnz .L${mode}_no_carry
# Copy results back out of the bounce buffer when output was misaligned.
380 mov %r8,$out # restore paramters
383 jz .L${mode}_out_aligned
387 .byte 0xf3,0x48,0xa5 # rep movsq
389 .L${mode}_out_aligned:
395 mov \$$PADLOCK_CHUNK,$chunk
397 if (!$PADLOCK_PREFETCH{$mode}) {
407 $code.=<<___ if ($mode eq "ctr32");
# Re-check the prefetch/page-crossing condition for the remaining tail.
409 mov $inp,%rax # check if prefetch crosses page
414 and \$0xfff,%rax # distance to page boundary
415 cmp \$$PADLOCK_PREFETCH{$mode},%rax
416 mov \$-$PADLOCK_PREFETCH{$mode},%rax
# Unaligned tail: alloca a stack buffer, copy, process, copy back.
422 .L${mode}_unaligned_tail:
426 mov $out,%r8 # save parameters
428 sub %rax,%rsp # alloca
431 .byte 0xf3,0x48,0xa5 # rep movsq
433 mov %r8, $out # restore parameters
# Phase 4: aligned path.  ctr32 again clamps the chunk, here so the
# counter cannot cross 2^16 in one invocation.
459 $code.=<<___ if ($mode eq "ctr32");
460 mov -4($ctx),%eax # pull 32-bit counter
464 mov \$`16*0x10000`,$chunk
468 cmova %rax,$chunk # don't let counter cross 2^16
470 jbe .L${mode}_aligned_skip
472 .L${mode}_aligned_loop:
473 mov $len,%r10 # save parameters
477 lea -16($ctx),%rax # ivp
478 lea 16($ctx),%rbx # key
479 shr \$4,$len # len/=AES_BLOCK_SIZE
480 .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
482 mov -4($ctx),%eax # pull 32-bit counter
488 mov %r10,$len # restore paramters
490 mov \$`16*0x10000`,$chunk
493 jae .L${mode}_aligned_loop
495 .L${mode}_aligned_skip:
# Aligned remainder: split off the prefetch-sensitive tail near the page
# boundary, process the bulk, refresh the IV for cbc/cfb/ofb, then handle
# the remainder through the aligned-tail bounce copy.
497 $code.=<<___ if ($PADLOCK_PREFETCH{$mode});
500 and \$0xfff,%rbp # distance to page boundary
502 cmp \$$PADLOCK_PREFETCH{$mode},%rbp
503 mov \$$PADLOCK_PREFETCH{$mode}-1,%rbp
505 and $len,%rbp # remainder
507 jz .L${mode}_aligned_tail
510 lea -16($ctx),%rax # ivp
511 lea 16($ctx),%rbx # key
512 shr \$4,$len # len/=AES_BLOCK_SIZE
513 .byte 0xf3,0x0f,0xa7,$opcode # rep xcrypt*
515 $code.=<<___ if ($mode !~ /ecb|ctr/);
517 movdqa %xmm0,-16($ctx) # copy [or refresh] iv
519 $code.=<<___ if ($PADLOCK_PREFETCH{$mode});
520 test %rbp,%rbp # check remainder
523 .L${mode}_aligned_tail:
531 .byte 0xf3,0x48,0xa5 # rep movsq
545 .size padlock_${mode}_encrypt,.-padlock_${mode}_encrypt
# Instantiate the five mode wrappers; the second argument is the ModRM
# opcode byte for the corresponding xcrypt instruction (ecb=C8, cbc=D0,
# cfb=E0, ofb=E8, ctr=D8).  (&sub() call syntax is legacy Perl, retained
# as found.)
549 &generate_mode("ecb",0xc8);
550 &generate_mode("cbc",0xd0);
551 &generate_mode("cfb",0xe0);
552 &generate_mode("ctr32",0xd8); # all 64-bit CPUs have working CTR...
# Module identification string and the saved-context data slot referenced
# by padlock_verify_context above.
556 .asciz "VIA Padlock x86_64 module, CRYPTOGAMS by <appro\@openssl.org>"
560 .Lpadlock_saved_context:
# Expand backtick-quoted Perl expressions embedded in the generated code
# before it is printed (the print/close lines are elided from this
# excerpt).
563 $code =~ s/\`([^\`]*)\`/eval($1)/gem;