# forms are granted according to the OpenSSL license.
# ====================================================================
#
-# Version 3.0.
+# Version 3.1.
#
# You might fail to appreciate this module performance from the first
# try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
# Instruction Level Parallelism, and it indeed resulted in up to 15%
# better performance on most recent µ-archs...
#
-# Current ECB performance numbers for 128-bit key in cycles per byte
-# [measure commonly used by AES benchmarkers] are:
+# Current ECB performance numbers for 128-bit key in CPU cycles per
+# processed byte [measure commonly used by AES benchmarkers] are:
#
# small footprint fully unrolled
# P4[-3] 23[24] 22[23]
# AMD K8 19 18
-# PIII 26(*) 23
+# PIII 26 23
# Pentium 63(*) 52
#
# (*) Performance difference between small footprint code and fully
-# unrolled in more commonly used CBC mode is not as big, 7% for
-# PIII and 15% for Pentium, which I consider tolerable.
+# unrolled in more commonly used CBC mode is not as big, 4% for
+# Pentium. PIII's ~13% difference [in both cases in 3rd
+# version] is considered tolerable...
#
# Third version adds AES_cbc_encrypt implementation, which resulted in
-# up to 40% performance imrovement of CBC benchmark results [on most
-# recent µ-archs]. CBC performance is virtually as good as ECB now and
-# sometimes even better, because function prologues and epilogues are
+# up to 40% performance improvement of CBC benchmark results. 40% was
+# observed on P4 core, where "overall" improvement coefficient, i.e. if
+# compared to PIC generated by GCC and in CBC mode, was observed to be
+# as large as 4x:-) CBC performance is virtually identical to ECB now
+# and on some platforms even better, e.g. 56 "small" cycles/byte on
+# senior Pentium, because certain function prologues and epilogues are
# effectively taken out of the loop...
push(@INC,"perlasm","../../perlasm");
$small_footprint=1; # $small_footprint=1 code is ~5% slower [on
# recent µ-archs], but ~5 times smaller!
- # I favor compact code, because it minimizes
- # cache contention...
+ # I favor compact code to minimize cache
+ # contention and in hope to "collect" 5% back
+ # in real-life applications...
$vertical_spin=0; # shift "verticaly" defaults to 0, because of
# its proof-of-concept status...
&push ($key eq "edi" ? $key : ""); # push ivp
&pushf ();
&mov ($key,&wparam(1)); # load out
- &xor ($s0,$s0);
- &mov (&DWP(0,$key),$s0); # zero output
- &mov (&DWP(4,$key),$s0);
- &mov (&DWP(8,$key),$s0);
- &mov (&DWP(12,$key),$s0);
+ &mov ($s1,16);
+ &sub ($s1,$s2);
+ &cmp ($key,$acc); # compare with inp
+ &je (&label("enc_in_place"));
&data_word(0x90A4F3FC); # cld; rep movsb; nop # copy input
+ &jmp (&label("enc_skip_in_place"));
+ &set_label("enc_in_place");
+ &lea ($key,&DWP(0,$key,$s2));
+ &set_label("enc_skip_in_place");
+ &mov ($s2,$s1);
+ &xor ($s0,$s0);
+ &data_word(0x90AAF3FC); # cld; rep stosb; nop # zero tail
&popf ();
&pop ($key); # pop ivp
&pushf ();
&data_word(0x90A4F3FC); # cld; rep movsb; nop # restore tail
&popf ();
+
+ &align (4);
&set_label("dec_out");
&stack_pop(5);
&function_end("AES_cbc_encrypt");