From b7b46c9a87c9fe7275a84c5ecb9f5f3459d7b307 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Tue, 30 Nov 2004 15:46:46 +0000 Subject: [PATCH] Add 0.9.7 specific comments to RC4 assembler modules. --- crypto/rc4/asm/rc4-586.pl | 15 ++++++++++----- crypto/rc4/asm/rc4-amd64.pl | 4 +++- crypto/rc4/asm/rc4-ia64.S | 2 +- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/crypto/rc4/asm/rc4-586.pl b/crypto/rc4/asm/rc4-586.pl index 977a9f1237..07b2bc6fcd 100644 --- a/crypto/rc4/asm/rc4-586.pl +++ b/crypto/rc4/asm/rc4-586.pl @@ -1,7 +1,7 @@ #!/usr/local/bin/perl # At some point it became apparent that the original SSLeay RC4 -# assembler implementation performs suboptimal on latest IA-32 +# assembler implementation performs suboptimally on latest IA-32 # microarchitectures. After re-tuning performance has changed as # following: # @@ -15,10 +15,12 @@ # In other words code performing further 13% faster on AMD # would perform almost 2 times slower on Intel PIII... # For reference! This code delivers ~80% of rc4-amd64.pl -# performance on same Opteron machine. +# performance on the same Opteron machine. # (**) This number requires compressed key schedule set up by -# RC4_set_key, see commentary section in rc4_skey.c for -# further details. +# RC4_set_key and therefore doesn't apply to 0.9.7 [option for +# compressed key schedule is implemented in 0.9.8 and later, +# see commentary section in rc4_skey.c for further details]. +# # push(@INC,"perlasm","../../perlasm"); @@ -130,6 +132,8 @@ sub RC4 &add( $d, 8); # detect compressed schedule, see commentary section in rc4_skey.c... + # in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant, + # as compressed key schedule is set up in 0.9.8 and later. &cmp(&DWP(256,$d),-1); &je(&label("RC4_CHAR")); @@ -190,7 +194,8 @@ sub RC4 &jmp(&label("finished")); &align(16); - # this is essentially Intel P4 specific codepath, see rc4_skey.c... 
+ # this is essentially Intel P4 specific codepath, see rc4_skey.c, + # and is engaged in 0.9.8 and later context... &set_label("RC4_CHAR"); &lea ($ty,&DWP(0,$in,$ty)); diff --git a/crypto/rc4/asm/rc4-amd64.pl b/crypto/rc4/asm/rc4-amd64.pl index 35e426d561..9e0da8af99 100755 --- a/crypto/rc4/asm/rc4-amd64.pl +++ b/crypto/rc4/asm/rc4-amd64.pl @@ -30,7 +30,9 @@ # RC4_CHAR. Kind of ironic, huh? As it's apparently impossible to # compose blended code, which would perform even within 30% marginal # on either AMD and Intel platforms, I implement both cases. See -# rc4_skey.c for further details... +# rc4_skey.c for further details... This applies to 0.9.8 and later. +# In 0.9.7 context RC4_CHAR codepath is never engaged and ~70 bytes +# of code remain redundant. $output=shift; diff --git a/crypto/rc4/asm/rc4-ia64.S b/crypto/rc4/asm/rc4-ia64.S index 4af7fba7b3..ae84af6729 100644 --- a/crypto/rc4/asm/rc4-ia64.S +++ b/crypto/rc4/asm/rc4-ia64.S @@ -18,7 +18,7 @@ // to input and output streams. Secondly, less obvious, it's possible // to pull up some references to elements of the key schedule itself. // Fact is that such prior loads are not safe only for "degenerated" -// key schedule, when all elements equal to the same value, which is +// key schedule, when some elements equal to the same value, which is // never the case [key schedule setup routine makes sure it's not]. // Furthermore. In order to compress loop body to the minimum, I chose // to deploy deposit instruction, which substitutes for the whole -- 2.25.1