e_padlock-x86*.pl: Nano-related update.

author Andy Polyakov <appro@openssl.org>

Tue, 4 Oct 2011 11:05:16 +0000 (11:05 +0000)

committer Andy Polyakov <appro@openssl.org>

Tue, 4 Oct 2011 11:05:16 +0000 (11:05 +0000)
author Andy Polyakov <appro@openssl.org>
Tue, 4 Oct 2011 11:05:16 +0000 (11:05 +0000)
committer Andy Polyakov <appro@openssl.org>
Tue, 4 Oct 2011 11:05:16 +0000 (11:05 +0000)
diff --git a/engines/asm/e_padlock-x86.pl b/engines/asm/e_padlock-x86.pl

index 672d91257a363a010fc2ae7e10bea1db03bfe690..df8f56b5214d7d2b735021443ddd0942936d1bc5 100644 (file)
--- a/engines/asm/e_padlock-x86.pl
+++ b/engines/asm/e_padlock-x86.pl
@@ -15,14 +15,21 @@
  # mode and ~75% in CBC mode. For aligned data improvement can be
  # observed for short inputs only, e.g. 45% for 64-byte messages in
  # ECB mode, 20% in CBC. Difference in performance for aligned vs.
-# misaligned data depends on misalignment and is either ~1.8x or
-# ~2.9x. These are approximately same factors as for hardware support,
-# so there is little reason to rely on the latter. It might actually
-# hurt performance in mixture of aligned and misaligned buffers,
-# because a) if you choose to flip 'align' flag on per-buffer basis,
-# then you'd have to reload key context; b) if you choose to set
-# 'align' flag permanently, it limits performance for aligned data
-# to ~1/2. All results were collected on 1.5GHz C7.
+# misaligned data depends on misalignment and is either ~1.8x or ~2.9x.
+# These are approximately same factors as for hardware support, so
+# there is little reason to rely on the latter. On the contrary, it
+# might actually hurt performance in mixture of aligned and misaligned
+# buffers, because a) if you choose to flip 'align' flag in control
+# word on per-buffer basis, then you'd have to reload key context,
+# which incurs penalty; b) if you choose to set 'align' flag
+# permanently, it limits performance even for aligned data to ~1/2.
+# All above mentioned results were collected on 1.5GHz C7. Nano on the
+# other hand handles unaligned data more gracefully. Depending on
+# algorithm and how unaligned data is, hardware can be up to 70% more
+# efficient than below software alignment procedures, nor does 'align'
+# flag have effect on aligned performance [if it has any meaning at all].
+# Therefore suggestion is to unconditionally set 'align' flag on Nano
+# for optimal performance.
  
  $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  push(@INC,"${dir}","${dir}../../crypto/perlasm");
@@ -362,7 +369,7 @@ my ($mode,$opcode) = @_;
         &ret    ();
  &function_end_B("padlock_sha1_oneshot");
  
-&function_begin_B("padlock_sha1");
+&function_begin_B("padlock_sha1_blocks");
         &push   ("edi");
         &push   ("esi");
         &mov    ("eax",-1);
@@ -373,7 +380,7 @@ my ($mode,$opcode) = @_;
         &pop    ("esi");
         &pop    ("edi");
         &ret    ();
-&function_end_B("padlock_sha1");
+&function_end_B("padlock_sha1_blocks");
  
  &function_begin_B("padlock_sha256_oneshot");
         &push   ("edi");
@@ -397,7 +404,7 @@ my ($mode,$opcode) = @_;
         &ret    ();
  &function_end_B("padlock_sha256_oneshot");
  
-&function_begin_B("padlock_sha256");
+&function_begin_B("padlock_sha256_blocks");
         &push   ("edi");
         &push   ("esi");
         &mov    ("eax",-1);
@@ -408,7 +415,19 @@ my ($mode,$opcode) = @_;
         &pop    ("esi");
         &pop    ("edi");
         &ret    ();
-&function_end_B("padlock_sha256");
+&function_end_B("padlock_sha256_blocks");
+
+&function_begin_B("padlock_sha512_blocks");
+       &push   ("edi");                        # save edi/esi, both are overwritten below
+       &push   ("esi");
+       &mov    ("edi",&wparam(0));             # 1st argument -> edi
+       &mov    ("esi",&wparam(1));             # 2nd argument -> esi
+       &mov    ("ecx",&wparam(2));             # 3rd argument -> ecx, the rep counter
+       &data_byte(0xf3,0x0f,0xa6,0xe0);        # rep xsha512
+       &pop    ("esi");                        # restore in reverse order of the pushes
+       &pop    ("edi");
+       &ret    ();
+&function_end_B("padlock_sha512_blocks");
  
  &asciz ("VIA Padlock x86 module, CRYPTOGAMS by <appro\@openssl.org>");
  &align (16);
@@ -417,7 +436,7 @@ my ($mode,$opcode) = @_;
  # Essentially this variable belongs in thread local storage.
  # Having this variable global on the other hand can only cause
  # few bogus key reloads [if any at all on single-CPU system],
-# so we accept the panalty...
+# so we accept the penalty...
  &set_label("padlock_saved_context",4);
  &data_word(0);
  
diff --git a/engines/asm/e_padlock-x86_64.pl b/engines/asm/e_padlock-x86_64.pl

index f1c040e7f3b9691d7fa9e1920448ae364ace554c..30e17c129c5ffc59b1c672dc9a500b27727e10a6 100644 (file)
--- a/engines/asm/e_padlock-x86_64.pl
+++ b/engines/asm/e_padlock-x86_64.pl
@@ -151,15 +151,15 @@ padlock_sha1_oneshot:
         ret
  .size  padlock_sha1_oneshot,.-padlock_sha1_oneshot
  
-.globl padlock_sha1
-.type  padlock_sha1,\@function,3
+.globl padlock_sha1_blocks
+.type  padlock_sha1_blocks,\@function,3
  .align 16
-padlock_sha1:
+padlock_sha1_blocks:
         mov     \$-1,%rax
         mov     %rdx,%rcx
         .byte   0xf3,0x0f,0xa6,0xc8     # rep xsha1
         ret
-.size  padlock_sha1,.-padlock_sha1
+.size  padlock_sha1_blocks,.-padlock_sha1_blocks
  
  .globl padlock_sha256_oneshot
  .type  padlock_sha256_oneshot,\@function,3
@@ -171,15 +171,23 @@ padlock_sha256_oneshot:
         ret
  .size  padlock_sha256_oneshot,.-padlock_sha256_oneshot
  
-.globl padlock_sha256
-.type  padlock_sha256,\@function,3
+.globl padlock_sha256_blocks
+.type  padlock_sha256_blocks,\@function,3
  .align 16
-padlock_sha256:
+padlock_sha256_blocks:
         mov     \$-1,%rax
         mov     %rdx,%rcx
         .byte   0xf3,0x0f,0xa6,0xd0     # rep xsha256
         ret
-.size  padlock_sha256,.-padlock_sha256
+.size  padlock_sha256_blocks,.-padlock_sha256_blocks
+
+.globl padlock_sha512_blocks
+.type  padlock_sha512_blocks,\@function,3
+.align 16
+padlock_sha512_blocks:
+       mov     %rdx,%rcx               # 3rd argument becomes the rep counter
+       .byte   0xf3,0x0f,0xa6,0xe0     # rep xsha512
+       ret
+.size  padlock_sha512_blocks,.-padlock_sha512_blocks
  ___
  
  sub generate_mode {
@@ -207,6 +215,7 @@ padlock_${mode}_encrypt:
         xor     %eax,%eax
         xor     %ebx,%ebx
         testl   \$`1<<5`,($ctx)         # align bit in control word
+       jnz     .L${mode}_aligned
         test    \$0x0f,$out
         setz    %al                     # !out_misaligned
         test    \$0x0f,$inp
author	Andy Polyakov <appro@openssl.org>
	Tue, 4 Oct 2011 11:05:16 +0000 (11:05 +0000)
committer	Andy Polyakov <appro@openssl.org>
	Tue, 4 Oct 2011 11:05:16 +0000 (11:05 +0000)
engines/asm/e_padlock-x86.pl		patch \| blob \| history
engines/asm/e_padlock-x86_64.pl		patch \| blob \| history