Bug-fix in CBC encrypt tail processing and commentary section update.

author Andy Polyakov <appro@openssl.org>

Thu, 20 Jan 2005 10:33:37 +0000 (10:33 +0000)

committer Andy Polyakov <appro@openssl.org>

Thu, 20 Jan 2005 10:33:37 +0000 (10:33 +0000)
author Andy Polyakov <appro@openssl.org>
Thu, 20 Jan 2005 10:33:37 +0000 (10:33 +0000)
committer Andy Polyakov <appro@openssl.org>
Thu, 20 Jan 2005 10:33:37 +0000 (10:33 +0000)
diff --git a/crypto/aes/asm/aes-586.pl b/crypto/aes/asm/aes-586.pl

index 4263a7e1dca210c45485f311bbf5a1dbc881f561..ee02ded463c0a71e5d57f8f2d98e6fe13ee578b2 100755 (executable)
--- a/crypto/aes/asm/aes-586.pl
+++ b/crypto/aes/asm/aes-586.pl
@@ -6,7 +6,7 @@
  # forms are granted according to the OpenSSL license.
  # ====================================================================
  #
-# Version 3.0.
+# Version 3.1.
  #
  # You might fail to appreciate this module performance from the first
  # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
@@ -46,23 +46,27 @@
  # Instruction Level Parallelism, and it indeed resulted in up to 15%
  # better performance on most recent µ-archs...
  #
-# Current ECB performance numbers for 128-bit key in cycles per byte
-# [measure commonly used by AES benchmarkers] are:
+# Current ECB performance numbers for 128-bit key in CPU cycles per
+# processed byte [measure commonly used by AES benchmarkers] are:
  #
  #              small footprint         fully unrolled
  # P4[-3]       23[24]                  22[23]
  # AMD K8       19                      18
-# PIII         26(*)                   23
+# PIII         26                      23
  # Pentium      63(*)                   52
  #
  # (*)  Performance difference between small footprint code and fully
-#      unrolled in more commonly used CBC mode is not as big, 7% for
-#      PIII and 15% for Pentium, which I consider tolerable.
+#      unrolled in more commonly used CBC mode is not as big, 4%
+#      for Pentium. PIII's ~13% difference [in both cases in 3rd
+#      version] is considered tolerable...
  #
  # Third version adds AES_cbc_encrypt implementation, which resulted in
-# up to 40% performance imrovement of CBC benchmark results [on most
-# recent µ-archs]. CBC performance is virtually as good as ECB now and
-# sometimes even better, because function prologues and epilogues are
+# up to 40% performance improvement of CBC benchmark results. 40% was
+# observed on P4 core, where "overall" improvement coefficient, i.e. if
+# compared to PIC generated by GCC and in CBC mode, was observed to be
+# as large as 4x:-) CBC performance is virtually identical to ECB now
+# and on some platforms even better, e.g. 56 "small" cycles/byte on
+# senior Pentium, because certain function prologues and epilogues are
  # effectively taken out of the loop...
  
  push(@INC,"perlasm","../../perlasm");
@@ -79,8 +83,9 @@ $acc="esi";
  
  $small_footprint=1;    # $small_footprint=1 code is ~5% slower [on
                         # recent µ-archs], but ~5 times smaller!
-                       # I favor compact code, because it minimizes
-                       # cache contention...
+                       # I favor compact code to minimize cache
+                       # contention and in hope to "collect" 5% back
+                       # in real-life applications...
  $vertical_spin=0;      # shift "verticaly" defaults to 0, because of
                         # its proof-of-concept status...
  
@@ -1296,12 +1301,18 @@ sub declast()
         &push   ($key eq "edi" ? $key : "");    # push ivp
         &pushf  ();
         &mov    ($key,&wparam(1));              # load out
-       &xor    ($s0,$s0);
-       &mov    (&DWP(0,$key),$s0);             # zero output
-       &mov    (&DWP(4,$key),$s0);
-       &mov    (&DWP(8,$key),$s0);
-       &mov    (&DWP(12,$key),$s0);
+       &mov    ($s1,16);
+       &sub    ($s1,$s2);
+       &cmp    ($key,$acc);                    # compare with inp
+       &je     (&label("enc_in_place"));
         &data_word(0x90A4F3FC); # cld; rep movsb; nop   # copy input
+       &jmp    (&label("enc_skip_in_place"));
+    &set_label("enc_in_place");
+       &lea    ($key,&DWP(0,$key,$s2));
+    &set_label("enc_skip_in_place");
+       &mov    ($s2,$s1);
+       &xor    ($s0,$s0);
+       &data_word(0x90AAF3FC); # cld; rep stosb; nop   # zero tail
         &popf   ();
         &pop    ($key);                         # pop ivp
  
@@ -1456,6 +1467,8 @@ sub declast()
         &pushf  ();
         &data_word(0x90A4F3FC); # cld; rep movsb; nop   # restore tail
         &popf   ();
+
+    &align     (4);
      &set_label("dec_out");
      &stack_pop(5);
  &function_end("AES_cbc_encrypt");
author	Andy Polyakov <appro@openssl.org>
	Thu, 20 Jan 2005 10:33:37 +0000 (10:33 +0000)
committer	Andy Polyakov <appro@openssl.org>
	Thu, 20 Jan 2005 10:33:37 +0000 (10:33 +0000)