2 # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # Assembler helpers for Padlock engine. Compared to original engine
20 # version relying on inline assembler and compiled with gcc 3.4.6 it
21 # was measured to provide ~100% improvement on misaligned data in ECB
22 # mode and ~75% in CBC mode. For aligned data improvement can be
23 # observed for short inputs only, e.g. 45% for 64-byte messages in
24 # ECB mode, 20% in CBC. Difference in performance for aligned vs.
25 # misaligned data depends on misalignment and is either ~1.8x or 2.9x.
26 # These are approximately same factors as for hardware support, so
27 # there is little reason to rely on the latter. On the contrary, it
28 # might actually hurt performance in mixture of aligned and misaligned
29 # buffers, because a) if you choose to flip 'align' flag in control
30 # word on per-buffer basis, then you'd have to reload key context,
31 # which incurs penalty; b) if you choose to set 'align' flag
32 # permanently, it limits performance even for aligned data to ~1/2.
33 # All above mentioned results were collected on 1.5GHz C7. Nano on the
34 # other hand handles unaligned data more gracefully. Depending on
35 # algorithm and how unaligned data is, hardware can be up to 70% more
36 # efficient than below software alignment procedures, nor does 'align'
37 # flag have effect on aligned performance [if it has any meaning at all].
38 # Therefore suggestion is to unconditionally set 'align' flag on Nano
39 # for optimal performance.
# Generator setup.
#  - Derive the directory of this script from $0 and add it, plus the
#    relative perlasm directory, to @INC so the assembler helper modules
#    can be located.
#  - Redirect STDOUT to the file named by $output; everything the emitter
#    calls below print ends up there.
#    NOTE(review): the assignment of $output is not visible in this view,
#    and the result of open() is not checked - confirm that is intentional
#    before adding error handling.
#  - %PADLOCK_PREFETCH is consulted per mode by the mode generator
#    ($PADLOCK_PREFETCH{$mode}); only ecb and cbc have entries.
#  - $PADLOCK_CHUNK is used both as a chunk size and, as $PADLOCK_CHUNK-1,
#    as a bit mask, hence the power-of-2 requirement.
41 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
42 push(@INC,"${dir}","${dir}../../crypto/perlasm");
46 open STDOUT,">$output";
50 %PADLOCK_PREFETCH=(ecb=>128, cbc=>64); # prefetch errata
51 $PADLOCK_CHUNK=512; # Must be a power of 2 larger than 16
# padlock_capability: emits the capability probe.
# Every failed check branches to the "noluck" label.
# NOTE(review): a number of instructions (the ones that actually fill
# eax/ebx/ecx/edx before each comparison, and the code after "noluck")
# are not visible in this view; the comments describe the visible
# statements only.
59 &function_begin_B("padlock_capability");
72 &jnc (&label("noluck"));
# Compare ebx, edx and ecx against 32-bit constants built from the
# strings 'tneC', 'Hrua' and 'slua' - read backwards these spell
# "Cent", "aurH", "auls" (presumably the CPUID vendor string).
75 &cmp ("ebx","0x".unpack("H*",'tneC'));
76 &jne (&label("noluck"));
77 &cmp ("edx","0x".unpack("H*",'Hrua'));
78 &jne (&label("noluck"));
79 &cmp ("ecx","0x".unpack("H*",'slua'));
80 &jne (&label("noluck"));
# Leaf 0xC0000000 is selected next; the value compared in edx must be at
# least 0xC0000001 or the probe gives up.
81 &mov ("eax",0xC0000000);
85 &cmp ("edx",0xC0000001);
86 &jb (&label("noluck"));
92 &cmp ("eax",0x06ff); # check for Nano
94 &mov ("eax",0xC0000001);
# ebx is shifted into bit position 4, and that bit is cleared in eax.
# NOTE(review): how the two are merged is not visible in this view.
99 &shl ("ebx",4); # bit#4 denotes Nano
100 &and ("eax",0xffffffef);
102 &set_label("noluck");
# The statement below previously lacked its terminating semicolon, which
# made it part of one expression together with the following statement.
105 &function_end_B("padlock_capability");
# padlock_key_bswap: emits a loop over 32-bit words of the structure passed
# as the first argument.
#  edx = pointer argument; ecx = the dword stored at offset 240 of it
#  (used as the loop counter - presumably the number of words to process).
# Each iteration loads one dword into eax, stores it back to the same
# address and advances edx by 4.
# NOTE(review): the instruction that transforms eax between the load and
# the store, and the counter decrement before the jnz, are not visible in
# this view.
107 &function_begin_B("padlock_key_bswap");
108 &mov ("edx",&wparam(0));
109 &mov ("ecx",&DWP(240,"edx"));
110 &set_label("bswap_loop");
111 &mov ("eax",&DWP(0,"edx"));
113 &mov (&DWP(0,"edx"),"eax");
114 &lea ("edx",&DWP(4,"edx"));
116 &jnz (&label("bswap_loop"));
118 &function_end_B("padlock_key_bswap");
# padlock_saved_context is a module-local word remembering which key
# context was verified last. It is only declared here; the label itself is
# placed at the very end of the module by
# set_label("padlock_saved_context",4).
120 # This is heuristic key context tracing. At first one
121 # believes that one should use atomic swap instructions,
122 # but it's not actually necessary. Point is that if
123 # padlock_saved_context was changed by another thread
124 # after we've read it and before we compare it with ctx,
125 # our key *shall* be reloaded upon thread context switch
126 # and we are therefore set in either case...
127 &static_label("padlock_saved_context");
# padlock_verify_context: public wrapper around _padlock_verify_ctx.
#  $ctx = first argument.
#  eax  = address of padlock_saved_context: the absolute address on
#         win32/coff targets, otherwise the offset of the variable relative
#         to "verify_pic_point". That label sits directly after the call,
#         i.e. at the call's return address, which _padlock_verify_ctx adds
#         back from 0(%esp) to obtain the real address.
# After the call one 4-byte stack slot is dropped with lea; the callee
# reads that slot as saved eflags at 4(%esp).
# NOTE(review): the instruction that pushes the slot is not visible in
# this view.
129 &function_begin_B("padlock_verify_context");
130 &mov ($ctx,&wparam(0));
131 &lea ("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
132 &DWP(&label("padlock_saved_context")."-".&label("verify_pic_point")));
134 &call ("_padlock_verify_ctx");
135 &set_label("verify_pic_point");
136 &lea ("esp",&DWP(4,"esp"));
138 &function_end_B("padlock_verify_context");
# _padlock_verify_ctx: internal helper with a register-based convention.
#  In:  $ctx      - key context being used by the caller
#       eax       - address of padlock_saved_context (win32/coff) or its
#                   offset from the caller's return address (other targets)
#       4(%esp)   - dword tested as saved eflags
# On non-win32/coff targets the return address at 0(%esp) is added to eax
# to turn the offset into an address.
# If bit 30 of the saved eflags is clear, or if $ctx equals the stored
# context, control goes straight to "verified".
# NOTE(review): the statements executed when neither condition holds are
# not visible in this view (presumably they force a key reload).
# In all cases $ctx is then recorded as the saved context.
140 &function_begin_B("_padlock_verify_ctx");
141 &add ("eax",&DWP(0,"esp")) if(!($::win32 or $::coff));# &padlock_saved_context
142 &bt (&DWP(4,"esp"),30); # eflags
143 &jnc (&label("verified"));
144 &cmp ($ctx,&DWP(0,"eax"));
145 &je (&label("verified"));
148 &set_label("verified");
149 &mov (&DWP(0,"eax"),$ctx);
151 &function_end_B("_padlock_verify_ctx");
# padlock_reload_key: emits the function of that name.
# NOTE(review): the instructions between the begin/end markers are not
# visible in this view, so its behaviour cannot be documented from here.
153 &function_begin_B("padlock_reload_key");
157 &function_end_B("padlock_reload_key");
# padlock_aes_block: emits a routine that runs "rep xcryptecb" once.
#  arg0 -> $out, arg1 -> $inp (both must be 16-byte aligned, see below),
#  arg2 -> $ctx.
# ebx is pointed at offset 32 of the context (key) and $ctx itself is moved
# to offset 16 (control word) before the instruction is issued.
# NOTE(review): register saves/restores and the block-count setup are not
# visible in this view.
159 &function_begin_B("padlock_aes_block");
163 &mov ($out,&wparam(0)); # must be 16-byte aligned
164 &mov ($inp,&wparam(1)); # must be 16-byte aligned
165 &mov ($ctx,&wparam(2));
167 &lea ("ebx",&DWP(32,$ctx)); # key
168 &lea ($ctx,&DWP(16,$ctx)); # control word
169 &data_byte(0xf3,0x0f,0xa7,0xc8); # rep xcryptecb
174 &function_end_B("padlock_aes_block");
# Body of the mode generator: emits padlock_${mode}_encrypt.
#   $mode   - suffix used in the function name and in every local label;
#             it also selects the ctr32-, ecb- and prefetch-specific paths
#             (the latter through $PADLOCK_PREFETCH{$mode}).
#   $opcode - last byte of the "rep xcrypt*" encoding
#             0xf3,0x0f,0xa7,$opcode.
# NOTE(review): the enclosing sub header, several closing braces and many
# individual instructions are not visible in this view. The comments below
# describe only what the visible statements establish.
177 my ($mode,$opcode) = @_;
178 # int padlock_$mode_encrypt(void *out, const void *inp,
179 # struct padlock_cipher_data *ctx, size_t len);
180 &function_begin("padlock_${mode}_encrypt");
181 &mov ($out,&wparam(0));
182 &mov ($inp,&wparam(1));
183 &mov ($ctx,&wparam(2));
184 &mov ($len,&wparam(3));
# Two argument checks bail out to the "abort" label (the tests themselves
# are not visible here).
186 &jnz (&label("${mode}_abort"));
188 &jnz (&label("${mode}_abort"));
# eax = address of padlock_saved_context, or its offset from
# "${mode}_pic_point" (the return address of the call) on non-win32/coff
# targets; _padlock_verify_ctx resolves the offset.
189 &lea ("eax",($::win32 or $::coff) ? &DWP(&label("padlock_saved_context")) :
190 &DWP(&label("padlock_saved_context")."-".&label("${mode}_pic_point")));
193 &call ("_padlock_verify_ctx");
194 &set_label("${mode}_pic_point");
195 &lea ($ctx,&DWP(16,$ctx)); # control word
197 if ($mode eq "ctr32") {
198 &movq ("mm0",&QWP(-16,$ctx)); # load [upper part of] counter
# If the control word has the align bit set, or if both buffers turn out
# to be aligned, take the "aligned" path.
201 &test (&DWP(0,$ctx),1<<5); # align bit in control word
202 &jnz (&label("${mode}_aligned"));
204 &setz ("al"); # !out_misaligned
206 &setz ("bl"); # !inp_misaligned
208 &jnz (&label("${mode}_aligned"));
# --- misaligned path: set up a stack frame (ebp) and scratch area ---
211 &mov ($chunk,$PADLOCK_CHUNK);
212 &not ("eax"); # out_misaligned?-1:0
213 &lea ("ebp",&DWP(-24,"esp"));
215 &cmovc ($chunk,$len); # chunk=len>PADLOCK_CHUNK?PADLOCK_CHUNK:len
216 &and ("eax",$chunk); # out_misaligned?chunk:0
219 &and ($chunk,$PADLOCK_CHUNK-1); # chunk=len%PADLOCK_CHUNK
220 &lea ("esp",&DWP(0,"eax","ebp")); # alloca
221 &mov ("eax",$PADLOCK_CHUNK);
222 &cmovz ($chunk,"eax"); # chunk=chunk?:PADLOCK_CHUNK
# Slot 16(%ebp) is what the "done" label later reloads into ebp.
226 &mov (&DWP(16,"ebp"),"eax");
227 if ($PADLOCK_PREFETCH{$mode}) {
229 &ja (&label("${mode}_loop"));
230 &mov ("eax",$inp); # check if prefetch crosses page
235 &and ("eax",0xfff); # distance to page boundary
236 &cmp ("eax",$PADLOCK_PREFETCH{$mode});
237 &mov ("eax",-$PADLOCK_PREFETCH{$mode});
238 &cmovae ("eax",$chunk); # mask=distance<prefetch?-prefetch:-1
240 &jz (&label("${mode}_unaligned_tail"));
242 &jmp (&label("${mode}_loop"));
# --- main chunk loop; frame layout: 0=out, 4=inp, 8=len, 12=chunk ---
244 &set_label("${mode}_loop",16);
245 &mov (&DWP(0,"ebp"),$out); # save parameters
246 &mov (&DWP(4,"ebp"),$inp);
247 &mov (&DWP(8,"ebp"),$len);
249 &mov (&DWP(12,"ebp"),$chunk); # chunk
250 if ($mode eq "ctr32") {
# ctr32: fill the scratch area with counter blocks (8 bytes from mm0,
# dword eax at +8, dword ecx at +12), then encrypt the scratch area in
# place ($inp and $out both point at %esp).
251 &mov ("ecx",&DWP(-4,$ctx));
253 &mov ("eax",&DWP(-8,$ctx)); # borrow $len
254 &set_label("${mode}_prepare");
255 &mov (&DWP(12,"esp",$out),"ecx");
257 &movq (&QWP(0,"esp",$out),"mm0");
259 &mov (&DWP(8,"esp",$out),"eax");
261 &lea ($out,&DWP(16,$out));
263 &jb (&label("${mode}_prepare"));
265 &mov (&DWP(-4,$ctx),"ecx");
266 &lea ($inp,&DWP(0,"esp"));
267 &lea ($out,&DWP(0,"esp"));
# Other modes: a misaligned output is redirected to the scratch area, and
# a misaligned input is first copied there with rep movsl.
270 &test ($out,0x0f); # out_misaligned
271 &cmovnz ($out,"esp");
272 &test ($inp,0x0f); # inp_misaligned
273 &jz (&label("${mode}_inp_aligned"));
275 &data_byte(0xf3,0xa5); # rep movsl
279 &set_label("${mode}_inp_aligned");
281 &lea ("eax",&DWP(-16,$ctx)); # ivp
282 &lea ("ebx",&DWP(16,$ctx)); # key
283 &shr ($len,4); # len/=AES_BLOCK_SIZE
284 &data_byte(0xf3,0x0f,0xa7,$opcode); # rep xcrypt*
285 if ($mode !~ /ecb|ctr/) {
286 &movaps ("xmm0",&QWP(0,"eax"));
287 &movaps (&QWP(-16,$ctx),"xmm0"); # copy [or refresh] iv
289 &mov ($out,&DWP(0,"ebp")); # restore parameters
290 &mov ($chunk,&DWP(12,"ebp"));
291 if ($mode eq "ctr32") {
# ctr32: xor the input with the encrypted counter blocks held at %esp
# and write the result to the real output, 16 bytes per iteration.
292 &mov ($inp,&DWP(4,"ebp"));
294 &set_label("${mode}_xor");
295 &movups ("xmm1",&QWP(0,$inp,$len));
296 &lea ($len,&DWP(16,$len));
297 &pxor ("xmm1",&QWP(-16,"esp",$len));
298 &movups (&QWP(-16,$out,$len),"xmm1");
300 &jb (&label("${mode}_xor"));
# Other modes: if the output went to the scratch area, copy it to the
# real destination with rep movsl.
303 &jz (&label("${mode}_out_aligned"));
305 &lea ($inp,&DWP(0,"esp"));
307 &data_byte(0xf3,0xa5); # rep movsl
309 &set_label("${mode}_out_aligned");
310 &mov ($inp,&DWP(4,"ebp"));
312 &mov ($len,&DWP(8,"ebp"));
# Subsequent iterations use a full PADLOCK_CHUNK.
316 &mov ($chunk,$PADLOCK_CHUNK);
317 if (!$PADLOCK_PREFETCH{$mode}) {
318 &jnz (&label("${mode}_loop"));
320 &jz (&label("${mode}_break"));
322 &jae (&label("${mode}_loop"));
# --- tail handling for modes with a prefetch restriction: the remaining
# input is copied to freshly allocated stack space and the loop re-entered
324 &set_label("${mode}_unaligned_tail");
328 &sub ("esp","eax"); # alloca
329 &mov ("eax", $out); # save parameters
332 &lea ($out,&DWP(0,"esp"));
333 &data_byte(0xf3,0xa5); # rep movsl
335 &mov ($out,"eax"); # restore parameters
337 &jmp (&label("${mode}_loop"));
339 &set_label("${mode}_break",16);
341 if ($mode ne "ctr32") {
343 &je (&label("${mode}_done"));
# Zero the scratch area starting at %esp, 16 bytes per iteration, so no
# processed data is left behind on the stack.
345 &pxor ("xmm0","xmm0");
346 &lea ("eax",&DWP(0,"esp"));
347 &set_label("${mode}_bzero");
348 &movaps (&QWP(0,"eax"),"xmm0");
349 &lea ("eax",&DWP(16,"eax"));
351 &ja (&label("${mode}_bzero"));
# Tear down the frame: reload ebp from slot 16 and release the stack.
353 &set_label("${mode}_done");
354 &mov ("ebp",&DWP(16,"ebp"));
355 &lea ("esp",&DWP(24,"ebp"));
356 if ($mode ne "ctr32") {
357 &jmp (&label("${mode}_exit"));
# --- aligned path: process the buffers in place with one rep xcrypt* ---
359 &set_label("${mode}_aligned",16);
360 if ($PADLOCK_PREFETCH{$mode}) {
# Compute how far the end of the input (inp+len) lies from a page
# boundary; a non-zero remainder is split off and handled by the tail.
361 &lea ("ebp",&DWP(0,$inp,$len));
363 &and ("ebp",0xfff); # distance to page boundary
365 &cmp ("ebp",$PADLOCK_PREFETCH{$mode});
366 &mov ("ebp",$PADLOCK_PREFETCH{$mode}-1);
367 &cmovae ("ebp","eax");
368 &and ("ebp",$len); # remainder
370 &jz (&label("${mode}_aligned_tail"));
372 &lea ("eax",&DWP(-16,$ctx)); # ivp
373 &lea ("ebx",&DWP(16,$ctx)); # key
374 &shr ($len,4); # len/=AES_BLOCK_SIZE
375 &data_byte(0xf3,0x0f,0xa7,$opcode); # rep xcrypt*
376 if ($mode ne "ecb") {
377 &movaps ("xmm0",&QWP(0,"eax"));
378 &movaps (&QWP(-16,$ctx),"xmm0"); # copy [or refresh] iv
380 if ($PADLOCK_PREFETCH{$mode}) {
382 &jz (&label("${mode}_exit"));
# Aligned tail: build the same ebp frame as the misaligned path, copy the
# remainder to the stack and finish it through the chunk loop.
384 &set_label("${mode}_aligned_tail");
386 &lea ("ebp",&DWP(-24,"esp"));
392 &mov (&DWP(16,"ebp"),"eax");
393 &mov ("eax", $out); # save parameters
396 &lea ($out,&DWP(0,"esp"));
397 &data_byte(0xf3,0xa5); # rep movsl
399 &mov ($out,"eax"); # restore parameters
401 &jmp (&label("${mode}_loop"));
403 &set_label("${mode}_exit"); }
# Common exit: drop the saved-eflags slot; ctr32 used mm0, so leave MMX
# state with emms.
405 &lea ("esp",&DWP(4,"esp")); # popf
406 &emms () if ($mode eq "ctr32");
407 &set_label("${mode}_abort");
408 &function_end("padlock_${mode}_encrypt");
# Instantiate one padlock_<mode>_encrypt routine per mode. The second
# argument is the final opcode byte of the "rep xcrypt*" instruction the
# generated routine uses; note that ctr32 deliberately shares 0xc8 with ecb.
411 &generate_mode("ecb",0xc8);
412 &generate_mode("cbc",0xd0);
413 &generate_mode("cfb",0xe0);
414 &generate_mode("ofb",0xe8);
415 &generate_mode("ctr32",0xc8); # yes, it implements own CTR with ECB opcode,
416 # because hardware CTR was introduced later
417 # and even has errata on certain C7 stepping.
418 # own implementation *always* works, though
419 # ~15% slower than dedicated hardware...
# padlock_xstore: emits a thin wrapper around the xstore instruction.
#  arg0 -> edi, arg1 -> edx, then the raw bytes 0x0f,0xa7,0xc0 (xstore)
#  are emitted.
# NOTE(review): saving/restoring of edi and the return sequence are not
# visible in this view.
421 &function_begin_B("padlock_xstore");
423 &mov ("edi",&wparam(0));
424 &mov ("edx",&wparam(1));
425 &data_byte(0x0f,0xa7,0xc0); # xstore
428 &function_end_B("padlock_xstore");
# _win32_segv_handler: exception handler installed by the *_oneshot SHA
# routines on win32/coff targets.
#  arg0 = *ExceptionRecord, arg2 = *ContextRecord.
# Returns 1 (ExceptionContinueSearch) unless the exception code is
# 0xC0000005 (STATUS_ACCESS_VIOLATION); in that case the dword at offset
# 184 of the context record (presumably the saved instruction pointer -
# confirm against the x86 CONTEXT layout) is advanced by 4, the length of
# the 4-byte "rep xsha*" encoding, and 0 (ExceptionContinueExecution) is
# returned.
# NOTE(review): the "ret" label and the return instruction are not visible
# in this view.
430 &function_begin_B("_win32_segv_handler");
431 &mov ("eax",1); # ExceptionContinueSearch
432 &mov ("edx",&wparam(0)); # *ExceptionRecord
433 &mov ("ecx",&wparam(2)); # *ContextRecord
# The cmp statement previously had no terminating semicolon and therefore
# merged with the following jne into a single expression.
434 &cmp (&DWP(0,"edx"),0xC0000005); # ExceptionRecord->ExceptionCode == STATUS_ACCESS_VIOLATION
435 &jne (&label("ret"));
436 &add (&DWP(184,"ecx"),4); # skip over rep sha*
437 &mov ("eax",0); # ExceptionContinueExecution
440 &function_end_B("_win32_segv_handler");
# Register the handler as a safe exception handler on win32 builds.
441 &safeseh("_win32_segv_handler") if ($::win32);
# padlock_sha1_oneshot: emits a wrapper around "rep xsha1".
#  arg0 -> edi (hash context), arg1 -> esi, arg2 -> ecx.
# On win32/coff targets _win32_segv_handler is linked in through %fs before
# the instruction and unlinked afterwards.
# The 20-byte context (16 bytes via xmm0 plus one dword via eax) is copied
# to a scratch area on the stack, processed there, and copied back to the
# caller's context; esp is parked in edx meanwhile.
# NOTE(review): the instructions that zero eax for the %fs:(%eax) accesses,
# align esp for movaps and point edi at the scratch area are not visible in
# this view, nor are the closing braces of the two if-blocks.
443 &function_begin_B("padlock_sha1_oneshot");
447 &mov ("edi",&wparam(0));
448 &mov ("esi",&wparam(1));
449 &mov ("ecx",&wparam(2));
450 if ($::win32 or $::coff) {
451 &push (&::islabel("_win32_segv_handler"));
452 &data_byte(0x64,0xff,0x30); # push %fs:(%eax)
453 &data_byte(0x64,0x89,0x20); # mov %esp,%fs:(%eax)
455 &mov ("edx","esp"); # put aside %esp
456 &add ("esp",-128); # 32 is enough but spec says 128
457 &movups ("xmm0",&QWP(0,"edi")); # copy-in context
459 &mov ("eax",&DWP(16,"edi"));
460 &movaps (&QWP(0,"esp"),"xmm0");
462 &mov (&DWP(16,"esp"),"eax");
464 &data_byte(0xf3,0x0f,0xa6,0xc8); # rep xsha1
465 &movaps ("xmm0",&QWP(0,"esp"));
466 &mov ("eax",&DWP(16,"esp"));
467 &mov ("esp","edx"); # restore %esp
468 if ($::win32 or $::coff) {
469 &data_byte(0x64,0x8f,0x05,0,0,0,0); # pop %fs:0
# drop the handler address pushed above
470 &lea ("esp",&DWP(4,"esp"));
472 &mov ("edi",&wparam(0));
473 &movups (&QWP(0,"edi"),"xmm0"); # copy-out context
474 &mov (&DWP(16,"edi"),"eax");
478 &function_end_B("padlock_sha1_oneshot");
# padlock_sha1_blocks: emits a wrapper around "rep xsha1" without the
# exception-handler setup used by the oneshot variant.
#  arg0 -> edi (hash context), arg1 -> esi, arg2 -> ecx.
# The 20-byte context (xmm0 + eax) is staged on the stack around the
# instruction and written back afterwards; esp is parked in edx.
# NOTE(review): stack allocation/alignment and the eax/edi setup for the
# instruction are not visible in this view.
480 &function_begin_B("padlock_sha1_blocks");
483 &mov ("edi",&wparam(0));
484 &mov ("esi",&wparam(1));
485 &mov ("edx","esp"); # put aside %esp
486 &mov ("ecx",&wparam(2));
488 &movups ("xmm0",&QWP(0,"edi")); # copy-in context
490 &mov ("eax",&DWP(16,"edi"));
491 &movaps (&QWP(0,"esp"),"xmm0");
493 &mov (&DWP(16,"esp"),"eax");
495 &data_byte(0xf3,0x0f,0xa6,0xc8); # rep xsha1
496 &movaps ("xmm0",&QWP(0,"esp"));
497 &mov ("eax",&DWP(16,"esp"));
498 &mov ("esp","edx"); # restore %esp
499 &mov ("edi",&wparam(0));
500 &movups (&QWP(0,"edi"),"xmm0"); # copy-out context
501 &mov (&DWP(16,"edi"),"eax");
505 &function_end_B("padlock_sha1_blocks");
# padlock_sha256_oneshot: emits a wrapper around "rep xsha256".
#  arg0 -> edi (hash context), arg1 -> esi, arg2 -> ecx.
# On win32/coff targets _win32_segv_handler is linked in through %fs before
# the instruction and unlinked afterwards.
# The 32-byte context (xmm0 + xmm1) is staged on the stack around the
# instruction and written back afterwards; esp is parked in edx.
# NOTE(review): the instructions that zero eax for the %fs:(%eax) accesses,
# allocate/align the stack area and set up eax/edi for the instruction are
# not visible in this view, nor are the closing braces of the if-blocks.
507 &function_begin_B("padlock_sha256_oneshot");
511 &mov ("edi",&wparam(0));
512 &mov ("esi",&wparam(1));
513 &mov ("ecx",&wparam(2));
514 if ($::win32 or $::coff) {
515 &push (&::islabel("_win32_segv_handler"));
516 &data_byte(0x64,0xff,0x30); # push %fs:(%eax)
517 &data_byte(0x64,0x89,0x20); # mov %esp,%fs:(%eax)
519 &mov ("edx","esp"); # put aside %esp
521 &movups ("xmm0",&QWP(0,"edi")); # copy-in context
523 &movups ("xmm1",&QWP(16,"edi"));
524 &movaps (&QWP(0,"esp"),"xmm0");
526 &movaps (&QWP(16,"esp"),"xmm1");
528 &data_byte(0xf3,0x0f,0xa6,0xd0); # rep xsha256
529 &movaps ("xmm0",&QWP(0,"esp"));
530 &movaps ("xmm1",&QWP(16,"esp"));
531 &mov ("esp","edx"); # restore %esp
532 if ($::win32 or $::coff) {
533 &data_byte(0x64,0x8f,0x05,0,0,0,0); # pop %fs:0
# drop the handler address pushed above
534 &lea ("esp",&DWP(4,"esp"));
536 &mov ("edi",&wparam(0));
537 &movups (&QWP(0,"edi"),"xmm0"); # copy-out context
538 &movups (&QWP(16,"edi"),"xmm1");
542 &function_end_B("padlock_sha256_oneshot");
# padlock_sha256_blocks: emits a wrapper around "rep xsha256" without the
# exception-handler setup used by the oneshot variant.
#  arg0 -> edi (hash context), arg1 -> esi, arg2 -> ecx.
# The 32-byte context (xmm0 + xmm1) is staged on the stack around the
# instruction and written back afterwards; esp is parked in edx.
# NOTE(review): stack allocation/alignment and the eax/edi setup for the
# instruction are not visible in this view.
544 &function_begin_B("padlock_sha256_blocks");
547 &mov ("edi",&wparam(0));
548 &mov ("esi",&wparam(1));
549 &mov ("ecx",&wparam(2));
550 &mov ("edx","esp"); # put aside %esp
552 &movups ("xmm0",&QWP(0,"edi")); # copy-in context
554 &movups ("xmm1",&QWP(16,"edi"));
555 &movaps (&QWP(0,"esp"),"xmm0");
557 &movaps (&QWP(16,"esp"),"xmm1");
559 &data_byte(0xf3,0x0f,0xa6,0xd0); # rep xsha256
560 &movaps ("xmm0",&QWP(0,"esp"));
561 &movaps ("xmm1",&QWP(16,"esp"));
562 &mov ("esp","edx"); # restore %esp
563 &mov ("edi",&wparam(0));
564 &movups (&QWP(0,"edi"),"xmm0"); # copy-out context
565 &movups (&QWP(16,"edi"),"xmm1");
569 &function_end_B("padlock_sha256_blocks");
# padlock_sha512_blocks: emits a wrapper around "rep xsha512".
#  arg0 -> edi (hash context), arg1 -> esi, arg2 -> ecx.
# The 64-byte context (xmm0..xmm3) is staged on the stack around the
# instruction and written back afterwards; esp is parked in edx.
# NOTE(review): stack allocation/alignment and the edi setup for the
# instruction are not visible in this view.
571 &function_begin_B("padlock_sha512_blocks");
574 &mov ("edi",&wparam(0));
575 &mov ("esi",&wparam(1));
576 &mov ("ecx",&wparam(2));
577 &mov ("edx","esp"); # put aside %esp
579 &movups ("xmm0",&QWP(0,"edi")); # copy-in context
581 &movups ("xmm1",&QWP(16,"edi"));
582 &movups ("xmm2",&QWP(32,"edi"));
583 &movups ("xmm3",&QWP(48,"edi"));
584 &movaps (&QWP(0,"esp"),"xmm0");
586 &movaps (&QWP(16,"esp"),"xmm1");
587 &movaps (&QWP(32,"esp"),"xmm2");
588 &movaps (&QWP(48,"esp"),"xmm3");
589 &data_byte(0xf3,0x0f,0xa6,0xe0); # rep xsha512
590 &movaps ("xmm0",&QWP(0,"esp"));
591 &movaps ("xmm1",&QWP(16,"esp"));
592 &movaps ("xmm2",&QWP(32,"esp"));
593 &movaps ("xmm3",&QWP(48,"esp"));
594 &mov ("esp","edx"); # restore %esp
595 &mov ("edi",&wparam(0));
596 &movups (&QWP(0,"edi"),"xmm0"); # copy-out context
597 &movups (&QWP(16,"edi"),"xmm1");
598 &movups (&QWP(32,"edi"),"xmm2");
599 &movups (&QWP(48,"edi"),"xmm3");
603 &function_end_B("padlock_sha512_blocks");
# Module trailer: an identification string followed by the storage label
# for padlock_saved_context (declared earlier with static_label and used
# by the context-verification code).
# NOTE(review): the second argument of set_label is presumably the
# requested alignment - confirm against the perlasm helper; the directive
# that reserves the word itself is not visible in this view.
605 &asciz ("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>");
609 # Essentially this variable belongs in thread local storage.
610 # Having this variable global on the other hand can only cause
611 # few bogus key reloads [if any at all on single-CPU system],
612 # so we accept the penalty...
613 &set_label("padlock_saved_context",4);