From 34aca2b6b6e65e04f25fdd6dd3a6dc3ddf894940 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sun, 26 Jun 2005 16:25:25 +0000 Subject: [PATCH] IA64 RC4 update from HEAD [see commentary in HEAD for details]. PR: 1114 --- crypto/rc4/Makefile | 6 +++- crypto/rc4/asm/rc4-ia64.S | 65 ++++++++++++++++++++------------------- crypto/rc4/rc4.h | 4 --- crypto/rc4/rc4_enc.c | 4 --- crypto/rc4/rc4_skey.c | 4 --- 5 files changed, 39 insertions(+), 44 deletions(-) diff --git a/crypto/rc4/Makefile b/crypto/rc4/Makefile index 0b2f734e43..c138188360 100644 --- a/crypto/rc4/Makefile +++ b/crypto/rc4/Makefile @@ -69,7 +69,11 @@ asm/rx86unix.cpp: asm/rc4-586.pl ../perlasm/x86asm.pl asm/rc4-x86_64.s: asm/rc4-x86_64.pl; $(PERL) asm/rc4-x86_64.pl $@ asm/rc4-ia64.s: asm/rc4-ia64.S - $(CC) $(CFLAGS) -E asm/rc4-ia64.S > $@ + @case `awk '/^#define RC4_INT/{print$$NF}' $(TOP)/include/openssl/opensslconf.h` in \ + int) set -x; $(CC) $(CFLAGS) -DSZ=4 -E asm/rc4-ia64.S > $@ ;; \ + char) set -x; $(CC) $(CFLAGS) -DSZ=1 -E asm/rc4-ia64.S > $@ ;; \ + *) exit 1 ;; \ + esac files: $(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO diff --git a/crypto/rc4/asm/rc4-ia64.S b/crypto/rc4/asm/rc4-ia64.S index b517d2e88f..a322d0c718 100644 --- a/crypto/rc4/asm/rc4-ia64.S +++ b/crypto/rc4/asm/rc4-ia64.S @@ -7,7 +7,7 @@ // disclaimed. // ==================================================================== -.ident "rc4-ia64.S, Version 1.1" +.ident "rc4-ia64.S, Version 2.0" .ident "IA-64 ISA artwork by Andy Polyakov " // What's wrong with compiler generated code? Because of the nature of @@ -27,17 +27,10 @@ // Legitimate "collisions" do occur within every 256^2 bytes window. // Fortunately there're enough free instruction slots to keep prior // reference to key[x+1], detect "collision" and compensate for it. -// All this without sacrificing a single clock cycle:-) -// Furthermore. 
In order to compress loop body to the minimum, I chose -// to deploy deposit instruction, which substitutes for the whole -// key->data+((x&255)<<log2(sizeof(key->data[0]))). This unfortunately -// requires key->data to be aligned at sizeof(key->data) boundary. -// This is why you'll find "RC4_INT pad[512-256-2];" addenum to RC4_KEY -// and "d=(RC4_INT *)(((size_t)(d+255))&~(sizeof(key->data)-1));" in -// rc4_skey.c [and rc4_enc.c, where it's retained for debugging -// purposes]. Throughput is ~210MBps on 900MHz CPU, which is is >3x -// faster than gcc generated code and +30% - if compared to HP-UX C. -// Unrolling loop below should give >30% on top of that... +// All this without sacrificing a single clock cycle:-) Throughput is +// ~210MBps on 900MHz CPU, which is >3x faster than gcc generated +// code and +30% - if compared to HP-UX C. Unrolling loop below should +// give >30% on top of that... .text .explicit @@ -48,7 +41,9 @@ # define ADDP add #endif +#ifndef SZ #define SZ 4 // this is set to sizeof(RC4_INT) +#endif // SZ==4 seems to be optimal. At least SZ==8 is not any faster, not for // assembler implementation, while SZ==1 code is ~30% slower. #if SZ==1 // RC4_INT is unsigned char @@ -101,45 +96,53 @@ RC4: ADDP out=0,in3 brp.loop.imp .Ltop,.Lexit-16 };; { .mmi; LDKEY yy=[key] // load key->y - add ksch=(255+1)*SZ,key // as ksch will be used with - // deposit instruction only, - // I don't have to &~255... + add ksch=SZ,key mov ar.lc=in1 } { .mmi; mov key_y[1]=r0 // guarantee inequality // in first iteration add xx=1,xx mov pr.rot=1<<16 };; { .mii; nop.m 0 - dep key_x[1]=xx,ksch,OFF,8 + dep key_x[1]=xx,r0,OFF,8 mov ar.ec=3 };; // note that epilogue counter // is off by 1. I compensate // for this at exit... .Ltop: -// The loop is scheduled for 3*(n+2) spin-rate on Itanium 2, which +// The loop is scheduled for 4*(n+2) spin-rate on Itanium 2, which // theoretically gives asymptotic performance of clock frequency -// divided by 3 bytes per seconds, or 500MBps on 1.5GHz CPU.
Measured -// performance however is distinctly lower than 1/4:-( The culplrit -// seems to be *(out++)=dat, which inadvertently splits the bundle, -// even though there is M-port available... Unrolling is due... -// Unrolled loop should collect output with variable shift instruction -// in order to avoid starvation for integer shifter... It should be -// possible to get pretty close to theoretical peak... -{ .mmi; (p16) LDKEY tx[0]=[key_x[1]] // tx=key[xx] - (p17) LDKEY ty[0]=[key_y[1]] // ty=key[yy] - (p18) dep rnd[1]=rnd[1],ksch,OFF,8} // &key[(tx+ty)&255] +// divided by 4 bytes per second, or 400MBps on 1.6GHz CPU. This is +// for sizeof(RC4_INT)==4. For smaller RC4_INT STKEY inadvertently +// splits the last bundle and you end up with 5*n spin-rate:-( +// Originally the loop was scheduled for 3*n and relied on key +// schedule to be aligned at 256*sizeof(RC4_INT) boundary. But +// *(out++)=dat, which maps to st1, had same effect [inadvertent +// bundle split] and held the loop back. Rescheduling for 4*n +// made it possible to eliminate dependence on specific alignment +// and allow OpenSSH to keep "abusing" our API. Reaching for 3*n would +// require unrolling, sticking to variable shift instruction for +// collecting output [to avoid starvation for integer shifter] and +// copying of key schedule to controlled place in stack [so that +// deposit instruction can serve as substitute for whole +// key->data+((x&255)<<log2(sizeof(key->data[0])))]... { .mmi; (p19) st1 [out]=dat[3],1 // *(out++)=dat (p16) add xx=1,xx // x++ - (p16) cmp.ne.unc p20,p21=key_x[1],key_y[1] };; + (p18) dep rnd[1]=rnd[1],r0,OFF,8 } // ((tx+ty)&255)<x; y=key->y; d=key->data; -#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64) - /* see crypto/rc4/asm/rc4-ia64.S for further details...
*/ - d=(RC4_INT *)(((size_t)(d+255))&~(sizeof(key->data)-1)); -#endif #if defined(RC4_CHUNK) /* diff --git a/crypto/rc4/rc4_skey.c b/crypto/rc4/rc4_skey.c index 0fca2e1a52..60510624fd 100644 --- a/crypto/rc4/rc4_skey.c +++ b/crypto/rc4/rc4_skey.c @@ -95,10 +95,6 @@ FIPS_NON_FIPS_VCIPHER_Init(RC4) unsigned int i; d= &(key->data[0]); -#if defined(__ia64) || defined(__ia64__) || defined(_M_IA64) - /* see crypto/rc4/asm/rc4-ia64.S for further details... */ - d=(RC4_INT *)(((size_t)(d+255))&~(sizeof(key->data)-1)); -#endif for (i=0; i<256; i++) d[i]=i; -- 2.25.1