From 28feb1f8da3839543137906c74870168c76a2757 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Mon, 23 Jul 2007 14:15:36 +0000 Subject: [PATCH] md32_common.h update from HEAD. --- crypto/md32_common.h | 373 +++--------- crypto/md4/md4_dgst.c | 89 --- crypto/md4/md4_locl.h | 44 -- crypto/md5/asm/md5-586.pl | 2 +- crypto/md5/asm/md5-sparcv9.S | 1031 --------------------------------- crypto/md5/asm/md5-x86_64.pl | 8 +- crypto/md5/md5_dgst.c | 106 ---- crypto/md5/md5_locl.h | 54 +- crypto/ripemd/asm/rmd-586.pl | 4 +- crypto/ripemd/rmd_dgst.c | 201 ------- crypto/ripemd/rmd_locl.h | 16 +- crypto/sha/asm/sha1-586.pl | 433 ++++---------- crypto/sha/asm/sha1-ia64.pl | 346 ++--------- crypto/sha/asm/sha512-ia64.pl | 422 +++++++++++--- crypto/sha/sha256.c | 67 +-- crypto/sha/sha512.c | 100 +++- crypto/sha/sha_locl.h | 278 ++------- 17 files changed, 723 insertions(+), 2851 deletions(-) delete mode 100644 crypto/md5/asm/md5-sparcv9.S diff --git a/crypto/md32_common.h b/crypto/md32_common.h index 0e625a8e55..089c450290 100644 --- a/crypto/md32_common.h +++ b/crypto/md32_common.h @@ -1,6 +1,6 @@ /* crypto/md32_common.h */ /* ==================================================================== - * Copyright (c) 1999-2002 The OpenSSL Project. All rights reserved. + * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -47,10 +47,6 @@ * OF THE POSSIBILITY OF SUCH DAMAGE. * ==================================================================== * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). - * */ /* @@ -76,40 +72,27 @@ * typedef struct { * ... * HASH_LONG Nl,Nh; + * either { * HASH_LONG data[HASH_LBLOCK]; + * unsigned char data[HASH_CBLOCK]; + * }; * unsigned int num; * ... * } HASH_CTX; + * data[] vector is expected to be zeroed upon first call to + * HASH_UPDATE. * HASH_UPDATE * name of "Update" function, implemented here. * HASH_TRANSFORM * name of "Transform" function, implemented here. * HASH_FINAL * name of "Final" function, implemented here. - * HASH_BLOCK_HOST_ORDER - * name of "block" function treating *aligned* input message - * in host byte order, implemented externally. * HASH_BLOCK_DATA_ORDER - * name of "block" function treating *unaligned* input message - * in original (data) byte order, implemented externally (it - * actually is optional if data and host are of the same - * "endianess"). + * name of "block" function capable of treating *unaligned* input + * message in original (data) byte order, implemented externally. * HASH_MAKE_STRING * macro convering context variables to an ASCII hash string. * - * Optional macros: - * - * B_ENDIAN or L_ENDIAN - * defines host byte-order. - * HASH_LONG_LOG2 - * defaults to 2 if not states otherwise. - * HASH_LBLOCK - * assumed to be HASH_CBLOCK/4 if not stated otherwise. - * HASH_BLOCK_DATA_ORDER_ALIGNED - * alternative "block" function capable of treating - * aligned input message in original (data) order, - * implemented externally. 
- * * MD5 example: * * #define DATA_ORDER_IS_LITTLE_ENDIAN @@ -118,11 +101,9 @@ * #define HASH_LONG_LOG2 MD5_LONG_LOG2 * #define HASH_CTX MD5_CTX * #define HASH_CBLOCK MD5_CBLOCK - * #define HASH_LBLOCK MD5_LBLOCK * #define HASH_UPDATE MD5_Update * #define HASH_TRANSFORM MD5_Transform * #define HASH_FINAL MD5_Final - * #define HASH_BLOCK_HOST_ORDER md5_block_host_order * #define HASH_BLOCK_DATA_ORDER md5_block_data_order * * @@ -152,27 +133,9 @@ #error "HASH_FINAL must be defined!" #endif -#ifndef HASH_BLOCK_HOST_ORDER -#error "HASH_BLOCK_HOST_ORDER must be defined!" -#endif - -#if 0 -/* - * Moved below as it's required only if HASH_BLOCK_DATA_ORDER_ALIGNED - * isn't defined. - */ #ifndef HASH_BLOCK_DATA_ORDER #error "HASH_BLOCK_DATA_ORDER must be defined!" #endif -#endif - -#ifndef HASH_LBLOCK -#define HASH_LBLOCK (HASH_CBLOCK/4) -#endif - -#ifndef HASH_LONG_LOG2 -#define HASH_LONG_LOG2 2 -#endif /* * Engage compiler specific rotate intrinsic function if available. @@ -206,7 +169,8 @@ : "cc"); \ ret; \ }) -# elif defined(__powerpc) || defined(__ppc__) || defined(__powerpc64__) +# elif defined(_ARCH_PPC) || defined(_ARCH_PPC64) || \ + defined(__powerpc) || defined(__ppc__) || defined(__powerpc64__) # define ROTATE(a,n) ({ register unsigned int ret; \ asm ( \ "rlwinm %0,%1,%2,0,31" \ @@ -214,80 +178,28 @@ : "r"(a), "I"(n)); \ ret; \ }) +# elif defined(__s390x__) +# define ROTATE(a,n) ({ register unsigned int ret; \ + asm ("rll %0,%1,%2" \ + : "=r"(ret) \ + : "r"(a), "I"(n)); \ + ret; \ + }) # endif # endif #endif /* PEDANTIC */ -#if HASH_LONG_LOG2==2 /* Engage only if sizeof(HASH_LONG)== 4 */ -/* A nice byte order reversal from Wei Dai */ -#ifdef ROTATE -/* 5 instructions with rotate instruction, else 9 */ -#define REVERSE_FETCH32(a,l) ( \ - l=*(const HASH_LONG *)(a), \ - ((ROTATE(l,8)&0x00FF00FF)|(ROTATE((l&0x00FF00FF),24))) \ - ) -#else -/* 6 instructions with rotate instruction, else 8 */ -#define REVERSE_FETCH32(a,l) ( \ - l=*(const HASH_LONG *)(a), \ - l=(((l>>8)&0x00FF00FF)|((l&0x00FF00FF)<<8)), \ - ROTATE(l,16) \ - ) -/* - * Originally the middle line started with l=(((l&0xFF00FF00)>>8)|... - * It's rewritten as above for two reasons: - * - RISCs aren't good at long constants and have to explicitely - * compose 'em with several (well, usually 2) instructions in a - * register before performing the actual operation and (as you - * already realized:-) having same constant should inspire the - * compiler to permanently allocate the only register for it; - * - most modern CPUs have two ALUs, but usually only one has - * circuitry for shifts:-( this minor tweak inspires compiler - * to schedule shift instructions in a better way... - * - * - */ -#endif -#endif - #ifndef ROTATE #define ROTATE(a,n) (((a)<<(n))|(((a)&0xffffffff)>>(32-(n)))) #endif -/* - * Make some obvious choices. E.g., HASH_BLOCK_DATA_ORDER_ALIGNED - * and HASH_BLOCK_HOST_ORDER ought to be the same if input data - * and host are of the same "endianess". It's possible to mask - * this with blank #define HASH_BLOCK_DATA_ORDER though... 
- * - * - */ -#if defined(B_ENDIAN) -# if defined(DATA_ORDER_IS_BIG_ENDIAN) -# if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_LONG_LOG2==2 -# define HASH_BLOCK_DATA_ORDER_ALIGNED HASH_BLOCK_HOST_ORDER -# endif -# endif -#elif defined(L_ENDIAN) -# if defined(DATA_ORDER_IS_LITTLE_ENDIAN) -# if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_LONG_LOG2==2 -# define HASH_BLOCK_DATA_ORDER_ALIGNED HASH_BLOCK_HOST_ORDER -# endif -# endif -#endif - -#if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) -#ifndef HASH_BLOCK_DATA_ORDER -#error "HASH_BLOCK_DATA_ORDER must be defined!" -#endif -#endif - #if defined(DATA_ORDER_IS_BIG_ENDIAN) #ifndef PEDANTIC # if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) # if ((defined(__i386) || defined(__i386__)) && !defined(I386_ONLY)) || \ (defined(__x86_64) || defined(__x86_64__)) +# if !defined(B_ENDIAN) /* * This gives ~30-40% performance improvement in SHA-256 compiled * with gcc [on P4]. Well, first macro to be frank. We can pull @@ -300,9 +212,14 @@ # define HOST_l2c(l,c) ({ unsigned int r=(l); \ asm ("bswapl %0":"=r"(r):"0"(r)); \ *((unsigned int *)(c))=r; (c)+=4; r; }) +# endif # endif # endif #endif +#if defined(__s390__) || defined(__s390x__) +# define HOST_c2l(c,l) ((l)=*((const unsigned int *)(c)), (c)+=4, (l)) +# define HOST_l2c(l,c) (*((unsigned int *)(c))=(l), (c)+=4, (l)) +#endif #ifndef HOST_c2l #define HOST_c2l(c,l) (l =(((unsigned long)(*((c)++)))<<24), \ @@ -311,29 +228,6 @@ l|=(((unsigned long)(*((c)++))) ), \ l) #endif -#define HOST_p_c2l(c,l,n) { \ - switch (n) { \ - case 0: l =((unsigned long)(*((c)++)))<<24; \ - case 1: l|=((unsigned long)(*((c)++)))<<16; \ - case 2: l|=((unsigned long)(*((c)++)))<< 8; \ - case 3: l|=((unsigned long)(*((c)++))); \ - } } -#define HOST_p_c2l_p(c,l,sc,len) { \ - switch (sc) { \ - case 0: l =((unsigned long)(*((c)++)))<<24; \ - if (--len == 0) break; \ - case 1: l|=((unsigned long)(*((c)++)))<<16; \ - if (--len == 0) break; \ - case 2: l|=((unsigned long)(*((c)++)))<< 8; \ - } } -/* NOTE the pointer is not incremented at the end of this */ -#define HOST_c2l_p(c,l,n) { \ - l=0; (c)+=n; \ - switch (n) { \ - case 3: l =((unsigned long)(*(--(c))))<< 8; \ - case 2: l|=((unsigned long)(*(--(c))))<<16; \ - case 1: l|=((unsigned long)(*(--(c))))<<24; \ - } } #ifndef HOST_l2c #define HOST_l2c(l,c) (*((c)++)=(unsigned char)(((l)>>24)&0xff), \ *((c)++)=(unsigned char)(((l)>>16)&0xff), \ @@ -344,6 +238,18 @@ #elif defined(DATA_ORDER_IS_LITTLE_ENDIAN) +#ifndef PEDANTIC +# if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) +# if defined(__s390x__) +# define HOST_c2l(c,l) ({ asm ("lrv %0,0(%1)" \ + :"=r"(l) : "r"(c)); \ + (c)+=4; (l); }) +# define HOST_l2c(l,c) ({ asm ("strv %0,0(%1)" \ + : : "r"(l),"r"(c) : "memory"); \ + (c)+=4; (l); }) +# endif +# endif +#endif #if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__) # ifndef B_ENDIAN /* See comment in DATA_ORDER_IS_BIG_ENDIAN section. 
*/ @@ -359,29 +265,6 @@ l|=(((unsigned long)(*((c)++)))<<24), \ l) #endif -#define HOST_p_c2l(c,l,n) { \ - switch (n) { \ - case 0: l =((unsigned long)(*((c)++))); \ - case 1: l|=((unsigned long)(*((c)++)))<< 8; \ - case 2: l|=((unsigned long)(*((c)++)))<<16; \ - case 3: l|=((unsigned long)(*((c)++)))<<24; \ - } } -#define HOST_p_c2l_p(c,l,sc,len) { \ - switch (sc) { \ - case 0: l =((unsigned long)(*((c)++))); \ - if (--len == 0) break; \ - case 1: l|=((unsigned long)(*((c)++)))<< 8; \ - if (--len == 0) break; \ - case 2: l|=((unsigned long)(*((c)++)))<<16; \ - } } -/* NOTE the pointer is not incremented at the end of this */ -#define HOST_c2l_p(c,l,n) { \ - l=0; (c)+=n; \ - switch (n) { \ - case 3: l =((unsigned long)(*(--(c))))<<16; \ - case 2: l|=((unsigned long)(*(--(c))))<< 8; \ - case 1: l|=((unsigned long)(*(--(c)))); \ - } } #ifndef HOST_l2c #define HOST_l2c(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \ *((c)++)=(unsigned char)(((l)>> 8)&0xff), \ @@ -399,9 +282,9 @@ int HASH_UPDATE (HASH_CTX *c, const void *data_, size_t len) { const unsigned char *data=data_; - register HASH_LONG * p; - register HASH_LONG l; - size_t sw,sc,ew,ec; + unsigned char *p; + HASH_LONG l; + size_t n; if (len==0) return 1; @@ -413,101 +296,43 @@ int HASH_UPDATE (HASH_CTX *c, const void *data_, size_t len) c->Nh+=(len>>29); /* might cause compiler warning on 16-bit */ c->Nl=l; - if (c->num != 0) + n = c->num; + if (n != 0) { - p=c->data; - sw=c->num>>2; - sc=c->num&0x03; + p=(unsigned char *)c->data; - if ((c->num+len) >= HASH_CBLOCK) + if ((n+len) >= HASH_CBLOCK) { - l=p[sw]; HOST_p_c2l(data,l,sc); p[sw++]=l; - for (; swnum); - c->num=0; - /* drop through and do the rest */ + memcpy (p+n,data,HASH_CBLOCK-n); + HASH_BLOCK_DATA_ORDER (c,p,1); + n = HASH_CBLOCK-n; + data += n; + len -= n; + c->num = 0; + memset (p,0,HASH_CBLOCK); /* keep it zeroed */ } else { - c->num+=(unsigned int)len; - if ((sc+len) < 4) /* ugly, add char's to a word */ - { - l=p[sw]; HOST_p_c2l_p(data,l,sc,len); p[sw]=l; - } - else - { - ew=(c->num>>2); - ec=(c->num&0x03); - if (sc) - l=p[sw]; - HOST_p_c2l(data,l,sc); - p[sw++]=l; - for (; sw < ew; sw++) - { - HOST_c2l(data,l); p[sw]=l; - } - if (ec) - { - HOST_c2l_p(data,l,ec); p[sw]=l; - } - } + memcpy (p+n,data,len); + c->num += (unsigned int)len; return 1; } } - sw=len/HASH_CBLOCK; - if (sw > 0) + n = len/HASH_CBLOCK; + if (n > 0) { -#if defined(HASH_BLOCK_DATA_ORDER_ALIGNED) - /* - * Note that HASH_BLOCK_DATA_ORDER_ALIGNED gets defined - * only if sizeof(HASH_LONG)==4. 
- */ - if ((((size_t)data)%4) == 0) - { - /* data is properly aligned so that we can cast it: */ - HASH_BLOCK_DATA_ORDER_ALIGNED (c,(const HASH_LONG *)data,sw); - sw*=HASH_CBLOCK; - data+=sw; - len-=sw; - } - else -#if !defined(HASH_BLOCK_DATA_ORDER) - while (sw--) - { - memcpy (p=c->data,data,HASH_CBLOCK); - HASH_BLOCK_DATA_ORDER_ALIGNED(c,p,1); - data+=HASH_CBLOCK; - len-=HASH_CBLOCK; - } -#endif -#endif -#if defined(HASH_BLOCK_DATA_ORDER) - { - HASH_BLOCK_DATA_ORDER(c,data,sw); - sw*=HASH_CBLOCK; - data+=sw; - len-=sw; - } -#endif + HASH_BLOCK_DATA_ORDER (c,data,n); + n *= HASH_CBLOCK; + data += n; + len -= n; } - if (len!=0) + if (len != 0) { - p = c->data; + p = (unsigned char *)c->data; c->num = len; - ew=len>>2; /* words to copy */ - ec=len&0x03; - for (; ew; ew--,p++) - { - HOST_c2l(data,l); *p=l; - } - HOST_c2l_p(data,l,ec); - *p=l; + memcpy (p,data,len); } return 1; } @@ -515,73 +340,38 @@ int HASH_UPDATE (HASH_CTX *c, const void *data_, size_t len) void HASH_TRANSFORM (HASH_CTX *c, const unsigned char *data) { -#if defined(HASH_BLOCK_DATA_ORDER_ALIGNED) - if ((((size_t)data)%4) == 0) - /* data is properly aligned so that we can cast it: */ - HASH_BLOCK_DATA_ORDER_ALIGNED (c,(const HASH_LONG *)data,1); - else -#if !defined(HASH_BLOCK_DATA_ORDER) - { - memcpy (c->data,data,HASH_CBLOCK); - HASH_BLOCK_DATA_ORDER_ALIGNED (c,c->data,1); - } -#endif -#endif -#if defined(HASH_BLOCK_DATA_ORDER) HASH_BLOCK_DATA_ORDER (c,data,1); -#endif } int HASH_FINAL (unsigned char *md, HASH_CTX *c) { - register HASH_LONG *p; - register unsigned long l; - register int i,j; - static const unsigned char end[4]={0x80,0x00,0x00,0x00}; - const unsigned char *cp=end; - - /* c->num should definitly have room for at least one more byte. */ - p=c->data; - i=c->num>>2; - j=c->num&0x03; - -#if 0 - /* purify often complains about the following line as an - * Uninitialized Memory Read. While this can be true, the - * following p_c2l macro will reset l when that case is true. - * This is because j&0x03 contains the number of 'valid' bytes - * already in p[i]. If and only if j&0x03 == 0, the UMR will - * occur but this is also the only time p_c2l will do - * l= *(cp++) instead of l|= *(cp++) - * Many thanks to Alex Tang for pickup this - * 'potential bug' */ -#ifdef PURIFY - if (j==0) p[i]=0; /* Yeah, but that's not the way to fix it:-) */ -#endif - l=p[i]; -#else - l = (j==0) ? 0 : p[i]; -#endif - HOST_p_c2l(cp,l,j); p[i++]=l; /* i is the next 'undefined word' */ + unsigned char *p = (unsigned char *)c->data; + size_t n = c->num; - if (i>(HASH_LBLOCK-2)) /* save room for Nl and Nh */ + p[n] = 0x80; /* there is always room for one */ + n++; + + if (n > (HASH_CBLOCK-8)) { - if (iNh; - p[HASH_LBLOCK-1]=c->Nl; + (void)HOST_l2c(c->Nh,p); + (void)HOST_l2c(c->Nl,p); #elif defined(DATA_ORDER_IS_LITTLE_ENDIAN) - p[HASH_LBLOCK-2]=c->Nl; - p[HASH_LBLOCK-1]=c->Nh; + (void)HOST_l2c(c->Nl,p); + (void)HOST_l2c(c->Nh,p); #endif - HASH_BLOCK_HOST_ORDER (c,p,1); + p -= HASH_CBLOCK; + HASH_BLOCK_DATA_ORDER (c,p,1); + c->num=0; + memset (p,0,HASH_CBLOCK); #ifndef HASH_MAKE_STRING #error "HASH_MAKE_STRING must be defined!" 
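For reference, the slimmed-down template above is meant to be instantiated exactly as the MD5 example in the file's header comment describes: the including module supplies the context type, block size, byte order and an external "block" function, then includes md32_common.h to obtain Update/Transform/Final. The following is a minimal illustrative sketch only; TOY_LONG, TOY_CTX, TOY_Init and toy_block_data_order are hypothetical stand-ins, not identifiers from this commit, and the real instantiations (md4_locl.h, md5_locl.h, rmd_locl.h, sha_locl.h) appear further down in the patch.

/* Illustrative sketch, not part of the patch: instantiating the new
 * md32_common.h template for a hypothetical "TOY" digest. */
#include <stddef.h>
#include <string.h>

typedef unsigned int TOY_LONG;           /* 32-bit chaining word            */
#define TOY_CBLOCK        64             /* block size in bytes             */
#define TOY_DIGEST_LENGTH 8              /* toy digest: two 32-bit words    */

typedef struct {
        TOY_LONG A, B;                   /* chaining variables              */
        TOY_LONG Nl, Nh;                 /* message bit length, low/high    */
        unsigned char data[TOY_CBLOCK];  /* byte buffer, zeroed at init     */
        unsigned int num;                /* bytes currently buffered        */
} TOY_CTX;

/* "block" function handling *unaligned* input in data byte order,
 * implemented externally (in C or assembler), per the new contract. */
void toy_block_data_order(TOY_CTX *c, const void *p, size_t num);

#define DATA_ORDER_IS_LITTLE_ENDIAN

#define HASH_LONG               TOY_LONG
#define HASH_CTX                TOY_CTX
#define HASH_CBLOCK             TOY_CBLOCK
#define HASH_UPDATE             TOY_Update
#define HASH_TRANSFORM          TOY_Transform
#define HASH_FINAL              TOY_Final
#define HASH_BLOCK_DATA_ORDER   toy_block_data_order
#define HASH_MAKE_STRING(c,s)   do {            \
        unsigned long ll;                       \
        ll=(c)->A; HOST_l2c(ll,(s));            \
        ll=(c)->B; HOST_l2c(ll,(s));            \
        } while (0)

#include "md32_common.h"   /* emits TOY_Update, TOY_Transform, TOY_Final */

/* Init must zero the whole context, in particular data[], because the
 * rewritten HASH_UPDATE/HASH_FINAL keep the buffer zeroed between blocks. */
int TOY_Init(TOY_CTX *c)
        {
        memset(c, 0, sizeof(*c));
        c->A = 0x67452301U;              /* arbitrary toy IV */
        c->B = 0xefcdab89U;
        return 1;
        }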
@@ -589,11 +379,6 @@ int HASH_FINAL (unsigned char *md, HASH_CTX *c) HASH_MAKE_STRING(c,md); #endif - c->num=0; - /* clear stuff, HASH_BLOCK may be leaving some stuff on the stack - * but I'm not worried :-) - OPENSSL_cleanse((void *)c,sizeof(HASH_CTX)); - */ return 1; } diff --git a/crypto/md4/md4_dgst.c b/crypto/md4/md4_dgst.c index 63ecac392a..62fb7e6677 100644 --- a/crypto/md4/md4_dgst.c +++ b/crypto/md4/md4_dgst.c @@ -84,79 +84,6 @@ FIPS_NON_FIPS_MD_Init(MD4) return 1; } -#ifndef md4_block_host_order -void md4_block_host_order (MD4_CTX *c, const void *data, size_t num) - { - const MD4_LONG *X=data; - register unsigned MD32_REG_T A,B,C,D; - - A=c->A; - B=c->B; - C=c->C; - D=c->D; - - for (;num--;X+=HASH_LBLOCK) - { - /* Round 0 */ - R0(A,B,C,D,X[ 0], 3,0); - R0(D,A,B,C,X[ 1], 7,0); - R0(C,D,A,B,X[ 2],11,0); - R0(B,C,D,A,X[ 3],19,0); - R0(A,B,C,D,X[ 4], 3,0); - R0(D,A,B,C,X[ 5], 7,0); - R0(C,D,A,B,X[ 6],11,0); - R0(B,C,D,A,X[ 7],19,0); - R0(A,B,C,D,X[ 8], 3,0); - R0(D,A,B,C,X[ 9], 7,0); - R0(C,D,A,B,X[10],11,0); - R0(B,C,D,A,X[11],19,0); - R0(A,B,C,D,X[12], 3,0); - R0(D,A,B,C,X[13], 7,0); - R0(C,D,A,B,X[14],11,0); - R0(B,C,D,A,X[15],19,0); - /* Round 1 */ - R1(A,B,C,D,X[ 0], 3,0x5A827999L); - R1(D,A,B,C,X[ 4], 5,0x5A827999L); - R1(C,D,A,B,X[ 8], 9,0x5A827999L); - R1(B,C,D,A,X[12],13,0x5A827999L); - R1(A,B,C,D,X[ 1], 3,0x5A827999L); - R1(D,A,B,C,X[ 5], 5,0x5A827999L); - R1(C,D,A,B,X[ 9], 9,0x5A827999L); - R1(B,C,D,A,X[13],13,0x5A827999L); - R1(A,B,C,D,X[ 2], 3,0x5A827999L); - R1(D,A,B,C,X[ 6], 5,0x5A827999L); - R1(C,D,A,B,X[10], 9,0x5A827999L); - R1(B,C,D,A,X[14],13,0x5A827999L); - R1(A,B,C,D,X[ 3], 3,0x5A827999L); - R1(D,A,B,C,X[ 7], 5,0x5A827999L); - R1(C,D,A,B,X[11], 9,0x5A827999L); - R1(B,C,D,A,X[15],13,0x5A827999L); - /* Round 2 */ - R2(A,B,C,D,X[ 0], 3,0x6ED9EBA1); - R2(D,A,B,C,X[ 8], 9,0x6ED9EBA1); - R2(C,D,A,B,X[ 4],11,0x6ED9EBA1); - R2(B,C,D,A,X[12],15,0x6ED9EBA1); - R2(A,B,C,D,X[ 2], 3,0x6ED9EBA1); - R2(D,A,B,C,X[10], 9,0x6ED9EBA1); - R2(C,D,A,B,X[ 6],11,0x6ED9EBA1); - R2(B,C,D,A,X[14],15,0x6ED9EBA1); - R2(A,B,C,D,X[ 1], 3,0x6ED9EBA1); - R2(D,A,B,C,X[ 9], 9,0x6ED9EBA1); - R2(C,D,A,B,X[ 5],11,0x6ED9EBA1); - R2(B,C,D,A,X[13],15,0x6ED9EBA1); - R2(A,B,C,D,X[ 3], 3,0x6ED9EBA1); - R2(D,A,B,C,X[11], 9,0x6ED9EBA1); - R2(C,D,A,B,X[ 7],11,0x6ED9EBA1); - R2(B,C,D,A,X[15],15,0x6ED9EBA1); - - A = c->A += A; - B = c->B += B; - C = c->C += C; - D = c->D += D; - } - } -#endif - #ifndef md4_block_data_order #ifdef X #undef X @@ -242,19 +169,3 @@ void md4_block_data_order (MD4_CTX *c, const void *data_, size_t num) } } #endif - -#ifdef undef -int printit(unsigned long *l) - { - int i,ii; - - for (i=0; i<2; i++) - { - for (ii=0; ii<8; ii++) - { - fprintf(stderr,"%08lx ",l[i*8+ii]); - } - fprintf(stderr,"\n"); - } - } -#endif diff --git a/crypto/md4/md4_locl.h b/crypto/md4/md4_locl.h index abc7b9bb84..c8085b0ead 100644 --- a/crypto/md4/md4_locl.h +++ b/crypto/md4/md4_locl.h @@ -65,43 +65,13 @@ #define MD4_LONG_LOG2 2 /* default to 32 bits */ #endif -void md4_block_host_order (MD4_CTX *c, const void *p,size_t num); void md4_block_data_order (MD4_CTX *c, const void *p,size_t num); -#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__INTEL__) -# if !defined(B_ENDIAN) -/* - * *_block_host_order is expected to handle aligned data while - * *_block_data_order - unaligned. As algorithm and host (x86) - * are in this case of the same "endianness" these two are - * otherwise indistinguishable. 
But normally you don't want to - * call the same function because unaligned access in places - * where alignment is expected is usually a "Bad Thing". Indeed, - * on RISCs you get punished with BUS ERROR signal or *severe* - * performance degradation. Intel CPUs are in turn perfectly - * capable of loading unaligned data without such drastic side - * effect. Yes, they say it's slower than aligned load, but no - * exception is generated and therefore performance degradation - * is *incomparable* with RISCs. What we should weight here is - * costs of unaligned access against costs of aligning data. - * According to my measurements allowing unaligned access results - * in ~9% performance improvement on Pentium II operating at - * 266MHz. I won't be surprised if the difference will be higher - * on faster systems:-) - * - * - */ -# define md4_block_data_order md4_block_host_order -# endif -#endif - #define DATA_ORDER_IS_LITTLE_ENDIAN #define HASH_LONG MD4_LONG -#define HASH_LONG_LOG2 MD4_LONG_LOG2 #define HASH_CTX MD4_CTX #define HASH_CBLOCK MD4_CBLOCK -#define HASH_LBLOCK MD4_LBLOCK #define HASH_UPDATE MD4_Update #define HASH_TRANSFORM MD4_Transform #define HASH_FINAL MD4_Final @@ -112,21 +82,7 @@ void md4_block_data_order (MD4_CTX *c, const void *p,size_t num); ll=(c)->C; HOST_l2c(ll,(s)); \ ll=(c)->D; HOST_l2c(ll,(s)); \ } while (0) -#define HASH_BLOCK_HOST_ORDER md4_block_host_order -#if !defined(L_ENDIAN) || defined(md4_block_data_order) #define HASH_BLOCK_DATA_ORDER md4_block_data_order -/* - * Little-endians (Intel and Alpha) feel better without this. - * It looks like memcpy does better job than generic - * md4_block_data_order on copying-n-aligning input data. - * But frankly speaking I didn't expect such result on Alpha. - * On the other hand I've got this with egcs-1.0.2 and if - * program is compiled with another (better?) compiler it - * might turn out other way around. - * - * - */ -#endif #include "md32_common.h" diff --git a/crypto/md5/asm/md5-586.pl b/crypto/md5/asm/md5-586.pl index fa3fa3bed5..76ac235f7d 100644 --- a/crypto/md5/asm/md5-586.pl +++ b/crypto/md5/asm/md5-586.pl @@ -29,7 +29,7 @@ $X="esi"; 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9, # R3 ); -&md5_block("md5_block_asm_host_order"); +&md5_block("md5_block_asm_data_order"); &asm_finish(); sub Np diff --git a/crypto/md5/asm/md5-sparcv9.S b/crypto/md5/asm/md5-sparcv9.S deleted file mode 100644 index db45aa4c97..0000000000 --- a/crypto/md5/asm/md5-sparcv9.S +++ /dev/null @@ -1,1031 +0,0 @@ -.ident "md5-sparcv9.S, Version 1.0" -.ident "SPARC V9 ISA artwork by Andy Polyakov " -.file "md5-sparcv9.S" - -/* - * ==================================================================== - * Copyright (c) 1999 Andy Polyakov . - * - * Rights for redistribution and usage in source and binary forms are - * granted as long as above copyright notices are retained. Warranty - * of any kind is (of course:-) disclaimed. - * ==================================================================== - */ - -/* - * This is my modest contribution to OpenSSL project (see - * http://www.openssl.org/ for more information about it) and is an - * assembler implementation of MD5 block hash function. I've hand-coded - * this for the sole reason to reach UltraSPARC-specific "load in - * little-endian byte order" instruction. This gives up to 15% - * performance improvement for cases when input message is aligned at - * 32 bits boundary. The module was tested under both 32 *and* 64 bit - * kernels. For updates see http://fy.chalmers.se/~appro/hpe/. 
- * - * To compile with SC4.x/SC5.x: - * - * cc -xarch=v[9|8plus] -DOPENSSL_SYSNAME_ULTRASPARC -DMD5_BLOCK_DATA_ORDER \ - * -c md5-sparcv9.S - * - * and with gcc: - * - * gcc -mcpu=ultrasparc -DOPENSSL_SYSNAME_ULTRASPARC -DMD5_BLOCK_DATA_ORDER \ - * -c md5-sparcv9.S - * - * or if above fails (it does if you have gas): - * - * gcc -E -DOPENSSL_SYSNAMEULTRASPARC -DMD5_BLOCK_DATA_ORDER md5_block.sparc.S | \ - * as -xarch=v8plus /dev/fd/0 -o md5-sparcv9.o - */ - -#include - -#define A %o0 -#define B %o1 -#define C %o2 -#define D %o3 -#define T1 %o4 -#define T2 %o5 - -#define R0 %l0 -#define R1 %l1 -#define R2 %l2 -#define R3 %l3 -#define R4 %l4 -#define R5 %l5 -#define R6 %l6 -#define R7 %l7 -#define R8 %i3 -#define R9 %i4 -#define R10 %i5 -#define R11 %g1 -#define R12 %g2 -#define R13 %g3 -#define RX %g4 - -#define Aptr %i0+0 -#define Bptr %i0+4 -#define Cptr %i0+8 -#define Dptr %i0+12 - -#define Aval R5 /* those not used at the end of the last round */ -#define Bval R6 -#define Cval R7 -#define Dval R8 - -#if defined(MD5_BLOCK_DATA_ORDER) -# if defined(OPENSSL_SYSNAME_ULTRASPARC) -# define LOAD lda -# define X(i) [%i1+i*4]%asi -# define md5_block md5_block_asm_data_order_aligned -# define ASI_PRIMARY_LITTLE 0x88 -# else -# error "MD5_BLOCK_DATA_ORDER is supported only on UltraSPARC!" -# endif -#else -# define LOAD ld -# define X(i) [%i1+i*4] -# define md5_block md5_block_asm_host_order -#endif - -.section ".text",#alloc,#execinstr - -#if defined(__SUNPRO_C) && defined(__sparcv9) - /* They've said -xarch=v9 at command line */ - .register %g2,#scratch - .register %g3,#scratch -# define FRAME -192 -#elif defined(__GNUC__) && defined(__arch64__) - /* They've said -m64 at command line */ - .register %g2,#scratch - .register %g3,#scratch -# define FRAME -192 -#else -# define FRAME -96 -#endif - -.align 32 - -.global md5_block -md5_block: - save %sp,FRAME,%sp - - ld [Dptr],D - ld [Cptr],C - ld [Bptr],B - ld [Aptr],A -#ifdef ASI_PRIMARY_LITTLE - rd %asi,%o7 ! How dare I? 
Well, I just do:-) - wr %g0,ASI_PRIMARY_LITTLE,%asi -#endif - LOAD X(0),R0 - -.Lmd5_block_loop: - -!!!!!!!!Round 0 - - xor C,D,T1 - sethi %hi(0xd76aa478),T2 - and T1,B,T1 - or T2,%lo(0xd76aa478),T2 != - xor T1,D,T1 - add T1,R0,T1 - LOAD X(1),R1 - add T1,T2,T1 != - add A,T1,A - sll A,7,T2 - srl A,32-7,A - or A,T2,A != - xor B,C,T1 - add A,B,A - - sethi %hi(0xe8c7b756),T2 - and T1,A,T1 != - or T2,%lo(0xe8c7b756),T2 - xor T1,C,T1 - LOAD X(2),R2 - add T1,R1,T1 != - add T1,T2,T1 - add D,T1,D - sll D,12,T2 - srl D,32-12,D != - or D,T2,D - xor A,B,T1 - add D,A,D - - sethi %hi(0x242070db),T2 != - and T1,D,T1 - or T2,%lo(0x242070db),T2 - xor T1,B,T1 - add T1,R2,T1 != - LOAD X(3),R3 - add T1,T2,T1 - add C,T1,C - sll C,17,T2 != - srl C,32-17,C - or C,T2,C - xor D,A,T1 - add C,D,C != - - sethi %hi(0xc1bdceee),T2 - and T1,C,T1 - or T2,%lo(0xc1bdceee),T2 - xor T1,A,T1 != - add T1,R3,T1 - LOAD X(4),R4 - add T1,T2,T1 - add B,T1,B != - sll B,22,T2 - srl B,32-22,B - or B,T2,B - xor C,D,T1 != - add B,C,B - - sethi %hi(0xf57c0faf),T2 - and T1,B,T1 - or T2,%lo(0xf57c0faf),T2 != - xor T1,D,T1 - add T1,R4,T1 - LOAD X(5),R5 - add T1,T2,T1 != - add A,T1,A - sll A,7,T2 - srl A,32-7,A - or A,T2,A != - xor B,C,T1 - add A,B,A - - sethi %hi(0x4787c62a),T2 - and T1,A,T1 != - or T2,%lo(0x4787c62a),T2 - xor T1,C,T1 - LOAD X(6),R6 - add T1,R5,T1 != - add T1,T2,T1 - add D,T1,D - sll D,12,T2 - srl D,32-12,D != - or D,T2,D - xor A,B,T1 - add D,A,D - - sethi %hi(0xa8304613),T2 != - and T1,D,T1 - or T2,%lo(0xa8304613),T2 - xor T1,B,T1 - add T1,R6,T1 != - LOAD X(7),R7 - add T1,T2,T1 - add C,T1,C - sll C,17,T2 != - srl C,32-17,C - or C,T2,C - xor D,A,T1 - add C,D,C != - - sethi %hi(0xfd469501),T2 - and T1,C,T1 - or T2,%lo(0xfd469501),T2 - xor T1,A,T1 != - add T1,R7,T1 - LOAD X(8),R8 - add T1,T2,T1 - add B,T1,B != - sll B,22,T2 - srl B,32-22,B - or B,T2,B - xor C,D,T1 != - add B,C,B - - sethi %hi(0x698098d8),T2 - and T1,B,T1 - or T2,%lo(0x698098d8),T2 != - xor T1,D,T1 - add T1,R8,T1 - LOAD X(9),R9 - add T1,T2,T1 != - add A,T1,A - sll A,7,T2 - srl A,32-7,A - or A,T2,A != - xor B,C,T1 - add A,B,A - - sethi %hi(0x8b44f7af),T2 - and T1,A,T1 != - or T2,%lo(0x8b44f7af),T2 - xor T1,C,T1 - LOAD X(10),R10 - add T1,R9,T1 != - add T1,T2,T1 - add D,T1,D - sll D,12,T2 - srl D,32-12,D != - or D,T2,D - xor A,B,T1 - add D,A,D - - sethi %hi(0xffff5bb1),T2 != - and T1,D,T1 - or T2,%lo(0xffff5bb1),T2 - xor T1,B,T1 - add T1,R10,T1 != - LOAD X(11),R11 - add T1,T2,T1 - add C,T1,C - sll C,17,T2 != - srl C,32-17,C - or C,T2,C - xor D,A,T1 - add C,D,C != - - sethi %hi(0x895cd7be),T2 - and T1,C,T1 - or T2,%lo(0x895cd7be),T2 - xor T1,A,T1 != - add T1,R11,T1 - LOAD X(12),R12 - add T1,T2,T1 - add B,T1,B != - sll B,22,T2 - srl B,32-22,B - or B,T2,B - xor C,D,T1 != - add B,C,B - - sethi %hi(0x6b901122),T2 - and T1,B,T1 - or T2,%lo(0x6b901122),T2 != - xor T1,D,T1 - add T1,R12,T1 - LOAD X(13),R13 - add T1,T2,T1 != - add A,T1,A - sll A,7,T2 - srl A,32-7,A - or A,T2,A != - xor B,C,T1 - add A,B,A - - sethi %hi(0xfd987193),T2 - and T1,A,T1 != - or T2,%lo(0xfd987193),T2 - xor T1,C,T1 - LOAD X(14),RX - add T1,R13,T1 != - add T1,T2,T1 - add D,T1,D - sll D,12,T2 - srl D,32-12,D != - or D,T2,D - xor A,B,T1 - add D,A,D - - sethi %hi(0xa679438e),T2 != - and T1,D,T1 - or T2,%lo(0xa679438e),T2 - xor T1,B,T1 - add T1,RX,T1 != - LOAD X(15),RX - add T1,T2,T1 - add C,T1,C - sll C,17,T2 != - srl C,32-17,C - or C,T2,C - xor D,A,T1 - add C,D,C != - - sethi %hi(0x49b40821),T2 - and T1,C,T1 - or T2,%lo(0x49b40821),T2 - xor T1,A,T1 != - add T1,RX,T1 - !pre-LOADed X(1),R1 - add T1,T2,T1 
- add B,T1,B - sll B,22,T2 != - srl B,32-22,B - or B,T2,B - add B,C,B - -!!!!!!!!Round 1 - - xor B,C,T1 != - sethi %hi(0xf61e2562),T2 - and T1,D,T1 - or T2,%lo(0xf61e2562),T2 - xor T1,C,T1 != - add T1,R1,T1 - !pre-LOADed X(6),R6 - add T1,T2,T1 - add A,T1,A - sll A,5,T2 != - srl A,32-5,A - or A,T2,A - add A,B,A - - xor A,B,T1 != - sethi %hi(0xc040b340),T2 - and T1,C,T1 - or T2,%lo(0xc040b340),T2 - xor T1,B,T1 != - add T1,R6,T1 - !pre-LOADed X(11),R11 - add T1,T2,T1 - add D,T1,D - sll D,9,T2 != - srl D,32-9,D - or D,T2,D - add D,A,D - - xor D,A,T1 != - sethi %hi(0x265e5a51),T2 - and T1,B,T1 - or T2,%lo(0x265e5a51),T2 - xor T1,A,T1 != - add T1,R11,T1 - !pre-LOADed X(0),R0 - add T1,T2,T1 - add C,T1,C - sll C,14,T2 != - srl C,32-14,C - or C,T2,C - add C,D,C - - xor C,D,T1 != - sethi %hi(0xe9b6c7aa),T2 - and T1,A,T1 - or T2,%lo(0xe9b6c7aa),T2 - xor T1,D,T1 != - add T1,R0,T1 - !pre-LOADed X(5),R5 - add T1,T2,T1 - add B,T1,B - sll B,20,T2 != - srl B,32-20,B - or B,T2,B - add B,C,B - - xor B,C,T1 != - sethi %hi(0xd62f105d),T2 - and T1,D,T1 - or T2,%lo(0xd62f105d),T2 - xor T1,C,T1 != - add T1,R5,T1 - !pre-LOADed X(10),R10 - add T1,T2,T1 - add A,T1,A - sll A,5,T2 != - srl A,32-5,A - or A,T2,A - add A,B,A - - xor A,B,T1 != - sethi %hi(0x02441453),T2 - and T1,C,T1 - or T2,%lo(0x02441453),T2 - xor T1,B,T1 != - add T1,R10,T1 - LOAD X(15),RX - add T1,T2,T1 - add D,T1,D != - sll D,9,T2 - srl D,32-9,D - or D,T2,D - add D,A,D != - - xor D,A,T1 - sethi %hi(0xd8a1e681),T2 - and T1,B,T1 - or T2,%lo(0xd8a1e681),T2 != - xor T1,A,T1 - add T1,RX,T1 - !pre-LOADed X(4),R4 - add T1,T2,T1 - add C,T1,C != - sll C,14,T2 - srl C,32-14,C - or C,T2,C - add C,D,C != - - xor C,D,T1 - sethi %hi(0xe7d3fbc8),T2 - and T1,A,T1 - or T2,%lo(0xe7d3fbc8),T2 != - xor T1,D,T1 - add T1,R4,T1 - !pre-LOADed X(9),R9 - add T1,T2,T1 - add B,T1,B != - sll B,20,T2 - srl B,32-20,B - or B,T2,B - add B,C,B != - - xor B,C,T1 - sethi %hi(0x21e1cde6),T2 - and T1,D,T1 - or T2,%lo(0x21e1cde6),T2 != - xor T1,C,T1 - add T1,R9,T1 - LOAD X(14),RX - add T1,T2,T1 != - add A,T1,A - sll A,5,T2 - srl A,32-5,A - or A,T2,A != - add A,B,A - - xor A,B,T1 - sethi %hi(0xc33707d6),T2 - and T1,C,T1 != - or T2,%lo(0xc33707d6),T2 - xor T1,B,T1 - add T1,RX,T1 - !pre-LOADed X(3),R3 - add T1,T2,T1 != - add D,T1,D - sll D,9,T2 - srl D,32-9,D - or D,T2,D != - add D,A,D - - xor D,A,T1 - sethi %hi(0xf4d50d87),T2 - and T1,B,T1 != - or T2,%lo(0xf4d50d87),T2 - xor T1,A,T1 - add T1,R3,T1 - !pre-LOADed X(8),R8 - add T1,T2,T1 != - add C,T1,C - sll C,14,T2 - srl C,32-14,C - or C,T2,C != - add C,D,C - - xor C,D,T1 - sethi %hi(0x455a14ed),T2 - and T1,A,T1 != - or T2,%lo(0x455a14ed),T2 - xor T1,D,T1 - add T1,R8,T1 - !pre-LOADed X(13),R13 - add T1,T2,T1 != - add B,T1,B - sll B,20,T2 - srl B,32-20,B - or B,T2,B != - add B,C,B - - xor B,C,T1 - sethi %hi(0xa9e3e905),T2 - and T1,D,T1 != - or T2,%lo(0xa9e3e905),T2 - xor T1,C,T1 - add T1,R13,T1 - !pre-LOADed X(2),R2 - add T1,T2,T1 != - add A,T1,A - sll A,5,T2 - srl A,32-5,A - or A,T2,A != - add A,B,A - - xor A,B,T1 - sethi %hi(0xfcefa3f8),T2 - and T1,C,T1 != - or T2,%lo(0xfcefa3f8),T2 - xor T1,B,T1 - add T1,R2,T1 - !pre-LOADed X(7),R7 - add T1,T2,T1 != - add D,T1,D - sll D,9,T2 - srl D,32-9,D - or D,T2,D != - add D,A,D - - xor D,A,T1 - sethi %hi(0x676f02d9),T2 - and T1,B,T1 != - or T2,%lo(0x676f02d9),T2 - xor T1,A,T1 - add T1,R7,T1 - !pre-LOADed X(12),R12 - add T1,T2,T1 != - add C,T1,C - sll C,14,T2 - srl C,32-14,C - or C,T2,C != - add C,D,C - - xor C,D,T1 - sethi %hi(0x8d2a4c8a),T2 - and T1,A,T1 != - or T2,%lo(0x8d2a4c8a),T2 - xor T1,D,T1 - 
add T1,R12,T1 - !pre-LOADed X(5),R5 - add T1,T2,T1 != - add B,T1,B - sll B,20,T2 - srl B,32-20,B - or B,T2,B != - add B,C,B - -!!!!!!!!Round 2 - - xor B,C,T1 - sethi %hi(0xfffa3942),T2 - xor T1,D,T1 != - or T2,%lo(0xfffa3942),T2 - add T1,R5,T1 - !pre-LOADed X(8),R8 - add T1,T2,T1 - add A,T1,A != - sll A,4,T2 - srl A,32-4,A - or A,T2,A - add A,B,A != - - xor A,B,T1 - sethi %hi(0x8771f681),T2 - xor T1,C,T1 - or T2,%lo(0x8771f681),T2 != - add T1,R8,T1 - !pre-LOADed X(11),R11 - add T1,T2,T1 - add D,T1,D - sll D,11,T2 != - srl D,32-11,D - or D,T2,D - add D,A,D - - xor D,A,T1 != - sethi %hi(0x6d9d6122),T2 - xor T1,B,T1 - or T2,%lo(0x6d9d6122),T2 - add T1,R11,T1 != - LOAD X(14),RX - add T1,T2,T1 - add C,T1,C - sll C,16,T2 != - srl C,32-16,C - or C,T2,C - add C,D,C - - xor C,D,T1 != - sethi %hi(0xfde5380c),T2 - xor T1,A,T1 - or T2,%lo(0xfde5380c),T2 - add T1,RX,T1 != - !pre-LOADed X(1),R1 - add T1,T2,T1 - add B,T1,B - sll B,23,T2 - srl B,32-23,B != - or B,T2,B - add B,C,B - - xor B,C,T1 - sethi %hi(0xa4beea44),T2 != - xor T1,D,T1 - or T2,%lo(0xa4beea44),T2 - add T1,R1,T1 - !pre-LOADed X(4),R4 - add T1,T2,T1 != - add A,T1,A - sll A,4,T2 - srl A,32-4,A - or A,T2,A != - add A,B,A - - xor A,B,T1 - sethi %hi(0x4bdecfa9),T2 - xor T1,C,T1 != - or T2,%lo(0x4bdecfa9),T2 - add T1,R4,T1 - !pre-LOADed X(7),R7 - add T1,T2,T1 - add D,T1,D != - sll D,11,T2 - srl D,32-11,D - or D,T2,D - add D,A,D != - - xor D,A,T1 - sethi %hi(0xf6bb4b60),T2 - xor T1,B,T1 - or T2,%lo(0xf6bb4b60),T2 != - add T1,R7,T1 - !pre-LOADed X(10),R10 - add T1,T2,T1 - add C,T1,C - sll C,16,T2 != - srl C,32-16,C - or C,T2,C - add C,D,C - - xor C,D,T1 != - sethi %hi(0xbebfbc70),T2 - xor T1,A,T1 - or T2,%lo(0xbebfbc70),T2 - add T1,R10,T1 != - !pre-LOADed X(13),R13 - add T1,T2,T1 - add B,T1,B - sll B,23,T2 - srl B,32-23,B != - or B,T2,B - add B,C,B - - xor B,C,T1 - sethi %hi(0x289b7ec6),T2 != - xor T1,D,T1 - or T2,%lo(0x289b7ec6),T2 - add T1,R13,T1 - !pre-LOADed X(0),R0 - add T1,T2,T1 != - add A,T1,A - sll A,4,T2 - srl A,32-4,A - or A,T2,A != - add A,B,A - - xor A,B,T1 - sethi %hi(0xeaa127fa),T2 - xor T1,C,T1 != - or T2,%lo(0xeaa127fa),T2 - add T1,R0,T1 - !pre-LOADed X(3),R3 - add T1,T2,T1 - add D,T1,D != - sll D,11,T2 - srl D,32-11,D - or D,T2,D - add D,A,D != - - xor D,A,T1 - sethi %hi(0xd4ef3085),T2 - xor T1,B,T1 - or T2,%lo(0xd4ef3085),T2 != - add T1,R3,T1 - !pre-LOADed X(6),R6 - add T1,T2,T1 - add C,T1,C - sll C,16,T2 != - srl C,32-16,C - or C,T2,C - add C,D,C - - xor C,D,T1 != - sethi %hi(0x04881d05),T2 - xor T1,A,T1 - or T2,%lo(0x04881d05),T2 - add T1,R6,T1 != - !pre-LOADed X(9),R9 - add T1,T2,T1 - add B,T1,B - sll B,23,T2 - srl B,32-23,B != - or B,T2,B - add B,C,B - - xor B,C,T1 - sethi %hi(0xd9d4d039),T2 != - xor T1,D,T1 - or T2,%lo(0xd9d4d039),T2 - add T1,R9,T1 - !pre-LOADed X(12),R12 - add T1,T2,T1 != - add A,T1,A - sll A,4,T2 - srl A,32-4,A - or A,T2,A != - add A,B,A - - xor A,B,T1 - sethi %hi(0xe6db99e5),T2 - xor T1,C,T1 != - or T2,%lo(0xe6db99e5),T2 - add T1,R12,T1 - LOAD X(15),RX - add T1,T2,T1 != - add D,T1,D - sll D,11,T2 - srl D,32-11,D - or D,T2,D != - add D,A,D - - xor D,A,T1 - sethi %hi(0x1fa27cf8),T2 - xor T1,B,T1 != - or T2,%lo(0x1fa27cf8),T2 - add T1,RX,T1 - !pre-LOADed X(2),R2 - add T1,T2,T1 - add C,T1,C != - sll C,16,T2 - srl C,32-16,C - or C,T2,C - add C,D,C != - - xor C,D,T1 - sethi %hi(0xc4ac5665),T2 - xor T1,A,T1 - or T2,%lo(0xc4ac5665),T2 != - add T1,R2,T1 - !pre-LOADed X(0),R0 - add T1,T2,T1 - add B,T1,B - sll B,23,T2 != - srl B,32-23,B - or B,T2,B - add B,C,B - -!!!!!!!!Round 3 - - orn B,D,T1 != - sethi 
%hi(0xf4292244),T2 - xor T1,C,T1 - or T2,%lo(0xf4292244),T2 - add T1,R0,T1 != - !pre-LOADed X(7),R7 - add T1,T2,T1 - add A,T1,A - sll A,6,T2 - srl A,32-6,A != - or A,T2,A - add A,B,A - - orn A,C,T1 - sethi %hi(0x432aff97),T2 != - xor T1,B,T1 - or T2,%lo(0x432aff97),T2 - LOAD X(14),RX - add T1,R7,T1 != - add T1,T2,T1 - add D,T1,D - sll D,10,T2 - srl D,32-10,D != - or D,T2,D - add D,A,D - - orn D,B,T1 - sethi %hi(0xab9423a7),T2 != - xor T1,A,T1 - or T2,%lo(0xab9423a7),T2 - add T1,RX,T1 - !pre-LOADed X(5),R5 - add T1,T2,T1 != - add C,T1,C - sll C,15,T2 - srl C,32-15,C - or C,T2,C != - add C,D,C - - orn C,A,T1 - sethi %hi(0xfc93a039),T2 - xor T1,D,T1 != - or T2,%lo(0xfc93a039),T2 - add T1,R5,T1 - !pre-LOADed X(12),R12 - add T1,T2,T1 - add B,T1,B != - sll B,21,T2 - srl B,32-21,B - or B,T2,B - add B,C,B != - - orn B,D,T1 - sethi %hi(0x655b59c3),T2 - xor T1,C,T1 - or T2,%lo(0x655b59c3),T2 != - add T1,R12,T1 - !pre-LOADed X(3),R3 - add T1,T2,T1 - add A,T1,A - sll A,6,T2 != - srl A,32-6,A - or A,T2,A - add A,B,A - - orn A,C,T1 != - sethi %hi(0x8f0ccc92),T2 - xor T1,B,T1 - or T2,%lo(0x8f0ccc92),T2 - add T1,R3,T1 != - !pre-LOADed X(10),R10 - add T1,T2,T1 - add D,T1,D - sll D,10,T2 - srl D,32-10,D != - or D,T2,D - add D,A,D - - orn D,B,T1 - sethi %hi(0xffeff47d),T2 != - xor T1,A,T1 - or T2,%lo(0xffeff47d),T2 - add T1,R10,T1 - !pre-LOADed X(1),R1 - add T1,T2,T1 != - add C,T1,C - sll C,15,T2 - srl C,32-15,C - or C,T2,C != - add C,D,C - - orn C,A,T1 - sethi %hi(0x85845dd1),T2 - xor T1,D,T1 != - or T2,%lo(0x85845dd1),T2 - add T1,R1,T1 - !pre-LOADed X(8),R8 - add T1,T2,T1 - add B,T1,B != - sll B,21,T2 - srl B,32-21,B - or B,T2,B - add B,C,B != - - orn B,D,T1 - sethi %hi(0x6fa87e4f),T2 - xor T1,C,T1 - or T2,%lo(0x6fa87e4f),T2 != - add T1,R8,T1 - LOAD X(15),RX - add T1,T2,T1 - add A,T1,A != - sll A,6,T2 - srl A,32-6,A - or A,T2,A - add A,B,A != - - orn A,C,T1 - sethi %hi(0xfe2ce6e0),T2 - xor T1,B,T1 - or T2,%lo(0xfe2ce6e0),T2 != - add T1,RX,T1 - !pre-LOADed X(6),R6 - add T1,T2,T1 - add D,T1,D - sll D,10,T2 != - srl D,32-10,D - or D,T2,D - add D,A,D - - orn D,B,T1 != - sethi %hi(0xa3014314),T2 - xor T1,A,T1 - or T2,%lo(0xa3014314),T2 - add T1,R6,T1 != - !pre-LOADed X(13),R13 - add T1,T2,T1 - add C,T1,C - sll C,15,T2 - srl C,32-15,C != - or C,T2,C - add C,D,C - - orn C,A,T1 - sethi %hi(0x4e0811a1),T2 != - xor T1,D,T1 - or T2,%lo(0x4e0811a1),T2 - !pre-LOADed X(4),R4 - ld [Aptr],Aval - add T1,R13,T1 != - add T1,T2,T1 - add B,T1,B - sll B,21,T2 - srl B,32-21,B != - or B,T2,B - add B,C,B - - orn B,D,T1 - sethi %hi(0xf7537e82),T2 != - xor T1,C,T1 - or T2,%lo(0xf7537e82),T2 - !pre-LOADed X(11),R11 - ld [Dptr],Dval - add T1,R4,T1 != - add T1,T2,T1 - add A,T1,A - sll A,6,T2 - srl A,32-6,A != - or A,T2,A - add A,B,A - - orn A,C,T1 - sethi %hi(0xbd3af235),T2 != - xor T1,B,T1 - or T2,%lo(0xbd3af235),T2 - !pre-LOADed X(2),R2 - ld [Cptr],Cval - add T1,R11,T1 != - add T1,T2,T1 - add D,T1,D - sll D,10,T2 - srl D,32-10,D != - or D,T2,D - add D,A,D - - orn D,B,T1 - sethi %hi(0x2ad7d2bb),T2 != - xor T1,A,T1 - or T2,%lo(0x2ad7d2bb),T2 - !pre-LOADed X(9),R9 - ld [Bptr],Bval - add T1,R2,T1 != - add Aval,A,Aval - add T1,T2,T1 - st Aval,[Aptr] - add C,T1,C != - sll C,15,T2 - add Dval,D,Dval - srl C,32-15,C - or C,T2,C != - st Dval,[Dptr] - add C,D,C - - orn C,A,T1 - sethi %hi(0xeb86d391),T2 != - xor T1,D,T1 - or T2,%lo(0xeb86d391),T2 - add T1,R9,T1 - !pre-LOADed X(0),R0 - mov Aval,A != - add T1,T2,T1 - mov Dval,D - add B,T1,B - sll B,21,T2 != - add Cval,C,Cval - srl B,32-21,B - st Cval,[Cptr] - or B,T2,B != - add B,C,B - - deccc %i2 
- mov Cval,C - add B,Bval,B != - inc 64,%i1 - nop - st B,[Bptr] - nop != - -#ifdef OPENSSL_SYSNAME_ULTRASPARC - bg,a,pt %icc,.Lmd5_block_loop -#else - bg,a .Lmd5_block_loop -#endif - LOAD X(0),R0 - -#ifdef ASI_PRIMARY_LITTLE - wr %g0,%o7,%asi -#endif - ret - restore %g0,0,%o0 - -.type md5_block,#function -.size md5_block,(.-md5_block) diff --git a/crypto/md5/asm/md5-x86_64.pl b/crypto/md5/asm/md5-x86_64.pl index c36a7febf7..9a6fa67224 100755 --- a/crypto/md5/asm/md5-x86_64.pl +++ b/crypto/md5/asm/md5-x86_64.pl @@ -111,9 +111,9 @@ $code .= <A; - B=c->B; - C=c->C; - D=c->D; - - for (;num--;X+=HASH_LBLOCK) - { - /* Round 0 */ - R0(A,B,C,D,X[ 0], 7,0xd76aa478L); - R0(D,A,B,C,X[ 1],12,0xe8c7b756L); - R0(C,D,A,B,X[ 2],17,0x242070dbL); - R0(B,C,D,A,X[ 3],22,0xc1bdceeeL); - R0(A,B,C,D,X[ 4], 7,0xf57c0fafL); - R0(D,A,B,C,X[ 5],12,0x4787c62aL); - R0(C,D,A,B,X[ 6],17,0xa8304613L); - R0(B,C,D,A,X[ 7],22,0xfd469501L); - R0(A,B,C,D,X[ 8], 7,0x698098d8L); - R0(D,A,B,C,X[ 9],12,0x8b44f7afL); - R0(C,D,A,B,X[10],17,0xffff5bb1L); - R0(B,C,D,A,X[11],22,0x895cd7beL); - R0(A,B,C,D,X[12], 7,0x6b901122L); - R0(D,A,B,C,X[13],12,0xfd987193L); - R0(C,D,A,B,X[14],17,0xa679438eL); - R0(B,C,D,A,X[15],22,0x49b40821L); - /* Round 1 */ - R1(A,B,C,D,X[ 1], 5,0xf61e2562L); - R1(D,A,B,C,X[ 6], 9,0xc040b340L); - R1(C,D,A,B,X[11],14,0x265e5a51L); - R1(B,C,D,A,X[ 0],20,0xe9b6c7aaL); - R1(A,B,C,D,X[ 5], 5,0xd62f105dL); - R1(D,A,B,C,X[10], 9,0x02441453L); - R1(C,D,A,B,X[15],14,0xd8a1e681L); - R1(B,C,D,A,X[ 4],20,0xe7d3fbc8L); - R1(A,B,C,D,X[ 9], 5,0x21e1cde6L); - R1(D,A,B,C,X[14], 9,0xc33707d6L); - R1(C,D,A,B,X[ 3],14,0xf4d50d87L); - R1(B,C,D,A,X[ 8],20,0x455a14edL); - R1(A,B,C,D,X[13], 5,0xa9e3e905L); - R1(D,A,B,C,X[ 2], 9,0xfcefa3f8L); - R1(C,D,A,B,X[ 7],14,0x676f02d9L); - R1(B,C,D,A,X[12],20,0x8d2a4c8aL); - /* Round 2 */ - R2(A,B,C,D,X[ 5], 4,0xfffa3942L); - R2(D,A,B,C,X[ 8],11,0x8771f681L); - R2(C,D,A,B,X[11],16,0x6d9d6122L); - R2(B,C,D,A,X[14],23,0xfde5380cL); - R2(A,B,C,D,X[ 1], 4,0xa4beea44L); - R2(D,A,B,C,X[ 4],11,0x4bdecfa9L); - R2(C,D,A,B,X[ 7],16,0xf6bb4b60L); - R2(B,C,D,A,X[10],23,0xbebfbc70L); - R2(A,B,C,D,X[13], 4,0x289b7ec6L); - R2(D,A,B,C,X[ 0],11,0xeaa127faL); - R2(C,D,A,B,X[ 3],16,0xd4ef3085L); - R2(B,C,D,A,X[ 6],23,0x04881d05L); - R2(A,B,C,D,X[ 9], 4,0xd9d4d039L); - R2(D,A,B,C,X[12],11,0xe6db99e5L); - R2(C,D,A,B,X[15],16,0x1fa27cf8L); - R2(B,C,D,A,X[ 2],23,0xc4ac5665L); - /* Round 3 */ - R3(A,B,C,D,X[ 0], 6,0xf4292244L); - R3(D,A,B,C,X[ 7],10,0x432aff97L); - R3(C,D,A,B,X[14],15,0xab9423a7L); - R3(B,C,D,A,X[ 5],21,0xfc93a039L); - R3(A,B,C,D,X[12], 6,0x655b59c3L); - R3(D,A,B,C,X[ 3],10,0x8f0ccc92L); - R3(C,D,A,B,X[10],15,0xffeff47dL); - R3(B,C,D,A,X[ 1],21,0x85845dd1L); - R3(A,B,C,D,X[ 8], 6,0x6fa87e4fL); - R3(D,A,B,C,X[15],10,0xfe2ce6e0L); - R3(C,D,A,B,X[ 6],15,0xa3014314L); - R3(B,C,D,A,X[13],21,0x4e0811a1L); - R3(A,B,C,D,X[ 4], 6,0xf7537e82L); - R3(D,A,B,C,X[11],10,0xbd3af235L); - R3(C,D,A,B,X[ 2],15,0x2ad7d2bbL); - R3(B,C,D,A,X[ 9],21,0xeb86d391L); - - A = c->A += A; - B = c->B += B; - C = c->C += C; - D = c->D += D; - } - } -#endif - #ifndef md5_block_data_order #ifdef X #undef X @@ -276,19 +186,3 @@ void md5_block_data_order (MD5_CTX *c, const void *data_, size_t num) } } #endif - -#ifdef undef -int printit(unsigned long *l) - { - int i,ii; - - for (i=0; i<2; i++) - { - for (ii=0; ii<8; ii++) - { - fprintf(stderr,"%08lx ",l[i*8+ii]); - } - fprintf(stderr,"\n"); - } - } -#endif diff --git a/crypto/md5/md5_locl.h b/crypto/md5/md5_locl.h index 94f395f27a..84e81b960d 100644 --- a/crypto/md5/md5_locl.h +++ 
b/crypto/md5/md5_locl.h @@ -66,53 +66,19 @@ #endif #ifdef MD5_ASM -# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) -# if !defined(B_ENDIAN) -# define md5_block_host_order md5_block_asm_host_order -# endif -# elif defined(__sparc) && defined(OPENSSL_SYS_ULTRASPARC) - void md5_block_asm_data_order_aligned (MD5_CTX *c, const MD5_LONG *p,size_t num); -# define HASH_BLOCK_DATA_ORDER_ALIGNED md5_block_asm_data_order_aligned +# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__INTEL__) || \ + defined(__x86_64) || defined(__x86_64__) || defined(_M_AMD64) || defined(_M_X64) +# define md5_block_data_order md5_block_asm_data_order # endif #endif -void md5_block_host_order (MD5_CTX *c, const void *p,size_t num); void md5_block_data_order (MD5_CTX *c, const void *p,size_t num); -#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__INTEL__) || defined(__x86_64) || defined(__x86_64__) -# if !defined(B_ENDIAN) -/* - * *_block_host_order is expected to handle aligned data while - * *_block_data_order - unaligned. As algorithm and host (x86) - * are in this case of the same "endianness" these two are - * otherwise indistinguishable. But normally you don't want to - * call the same function because unaligned access in places - * where alignment is expected is usually a "Bad Thing". Indeed, - * on RISCs you get punished with BUS ERROR signal or *severe* - * performance degradation. Intel CPUs are in turn perfectly - * capable of loading unaligned data without such drastic side - * effect. Yes, they say it's slower than aligned load, but no - * exception is generated and therefore performance degradation - * is *incomparable* with RISCs. What we should weight here is - * costs of unaligned access against costs of aligning data. - * According to my measurements allowing unaligned access results - * in ~9% performance improvement on Pentium II operating at - * 266MHz. I won't be surprised if the difference will be higher - * on faster systems:-) - * - * - */ -# define md5_block_data_order md5_block_host_order -# endif -#endif - #define DATA_ORDER_IS_LITTLE_ENDIAN #define HASH_LONG MD5_LONG -#define HASH_LONG_LOG2 MD5_LONG_LOG2 #define HASH_CTX MD5_CTX #define HASH_CBLOCK MD5_CBLOCK -#define HASH_LBLOCK MD5_LBLOCK #define HASH_UPDATE MD5_Update #define HASH_TRANSFORM MD5_Transform #define HASH_FINAL MD5_Final @@ -123,21 +89,7 @@ void md5_block_data_order (MD5_CTX *c, const void *p,size_t num); ll=(c)->C; HOST_l2c(ll,(s)); \ ll=(c)->D; HOST_l2c(ll,(s)); \ } while (0) -#define HASH_BLOCK_HOST_ORDER md5_block_host_order -#if !defined(L_ENDIAN) || defined(md5_block_data_order) #define HASH_BLOCK_DATA_ORDER md5_block_data_order -/* - * Little-endians (Intel and Alpha) feel better without this. - * It looks like memcpy does better job than generic - * md5_block_data_order on copying-n-aligning input data. - * But frankly speaking I didn't expect such result on Alpha. - * On the other hand I've got this with egcs-1.0.2 and if - * program is compiled with another (better?) compiler it - * might turn out other way around. 
- * - * - */ -#endif #include "md32_common.h" diff --git a/crypto/ripemd/asm/rmd-586.pl b/crypto/ripemd/asm/rmd-586.pl index 0ab6f76bff..4f3c4c967f 100644 --- a/crypto/ripemd/asm/rmd-586.pl +++ b/crypto/ripemd/asm/rmd-586.pl @@ -1,7 +1,7 @@ #!/usr/local/bin/perl # Normal is the -# ripemd160_block_asm_host_order(RIPEMD160_CTX *c, ULONG *X,int blocks); +# ripemd160_block_asm_data_order(RIPEMD160_CTX *c, ULONG *X,int blocks); $normal=0; @@ -56,7 +56,7 @@ $KR3=0x7A6D76E9; 8, 5,12, 9,12, 5,14, 6, 8,13, 6, 5,15,13,11,11, ); -&ripemd160_block("ripemd160_block_asm_host_order"); +&ripemd160_block("ripemd160_block_asm_data_order"); &asm_finish(); sub Xv diff --git a/crypto/ripemd/rmd_dgst.c b/crypto/ripemd/rmd_dgst.c index 8b22397fa4..3c4ad4ff10 100644 --- a/crypto/ripemd/rmd_dgst.c +++ b/crypto/ripemd/rmd_dgst.c @@ -84,207 +84,6 @@ FIPS_NON_FIPS_MD_Init(RIPEMD160) return 1; } -#ifndef ripemd160_block_host_order -#ifdef X -#undef X -#endif -#define X(i) XX[i] -void ripemd160_block_host_order (RIPEMD160_CTX *ctx, const void *p, size_t num) - { - const RIPEMD160_LONG *XX=p; - register unsigned MD32_REG_T A,B,C,D,E; - register unsigned MD32_REG_T a,b,c,d,e; - - for (;num--;XX+=HASH_LBLOCK) - { - - A=ctx->A; B=ctx->B; C=ctx->C; D=ctx->D; E=ctx->E; - - RIP1(A,B,C,D,E,WL00,SL00); - RIP1(E,A,B,C,D,WL01,SL01); - RIP1(D,E,A,B,C,WL02,SL02); - RIP1(C,D,E,A,B,WL03,SL03); - RIP1(B,C,D,E,A,WL04,SL04); - RIP1(A,B,C,D,E,WL05,SL05); - RIP1(E,A,B,C,D,WL06,SL06); - RIP1(D,E,A,B,C,WL07,SL07); - RIP1(C,D,E,A,B,WL08,SL08); - RIP1(B,C,D,E,A,WL09,SL09); - RIP1(A,B,C,D,E,WL10,SL10); - RIP1(E,A,B,C,D,WL11,SL11); - RIP1(D,E,A,B,C,WL12,SL12); - RIP1(C,D,E,A,B,WL13,SL13); - RIP1(B,C,D,E,A,WL14,SL14); - RIP1(A,B,C,D,E,WL15,SL15); - - RIP2(E,A,B,C,D,WL16,SL16,KL1); - RIP2(D,E,A,B,C,WL17,SL17,KL1); - RIP2(C,D,E,A,B,WL18,SL18,KL1); - RIP2(B,C,D,E,A,WL19,SL19,KL1); - RIP2(A,B,C,D,E,WL20,SL20,KL1); - RIP2(E,A,B,C,D,WL21,SL21,KL1); - RIP2(D,E,A,B,C,WL22,SL22,KL1); - RIP2(C,D,E,A,B,WL23,SL23,KL1); - RIP2(B,C,D,E,A,WL24,SL24,KL1); - RIP2(A,B,C,D,E,WL25,SL25,KL1); - RIP2(E,A,B,C,D,WL26,SL26,KL1); - RIP2(D,E,A,B,C,WL27,SL27,KL1); - RIP2(C,D,E,A,B,WL28,SL28,KL1); - RIP2(B,C,D,E,A,WL29,SL29,KL1); - RIP2(A,B,C,D,E,WL30,SL30,KL1); - RIP2(E,A,B,C,D,WL31,SL31,KL1); - - RIP3(D,E,A,B,C,WL32,SL32,KL2); - RIP3(C,D,E,A,B,WL33,SL33,KL2); - RIP3(B,C,D,E,A,WL34,SL34,KL2); - RIP3(A,B,C,D,E,WL35,SL35,KL2); - RIP3(E,A,B,C,D,WL36,SL36,KL2); - RIP3(D,E,A,B,C,WL37,SL37,KL2); - RIP3(C,D,E,A,B,WL38,SL38,KL2); - RIP3(B,C,D,E,A,WL39,SL39,KL2); - RIP3(A,B,C,D,E,WL40,SL40,KL2); - RIP3(E,A,B,C,D,WL41,SL41,KL2); - RIP3(D,E,A,B,C,WL42,SL42,KL2); - RIP3(C,D,E,A,B,WL43,SL43,KL2); - RIP3(B,C,D,E,A,WL44,SL44,KL2); - RIP3(A,B,C,D,E,WL45,SL45,KL2); - RIP3(E,A,B,C,D,WL46,SL46,KL2); - RIP3(D,E,A,B,C,WL47,SL47,KL2); - - RIP4(C,D,E,A,B,WL48,SL48,KL3); - RIP4(B,C,D,E,A,WL49,SL49,KL3); - RIP4(A,B,C,D,E,WL50,SL50,KL3); - RIP4(E,A,B,C,D,WL51,SL51,KL3); - RIP4(D,E,A,B,C,WL52,SL52,KL3); - RIP4(C,D,E,A,B,WL53,SL53,KL3); - RIP4(B,C,D,E,A,WL54,SL54,KL3); - RIP4(A,B,C,D,E,WL55,SL55,KL3); - RIP4(E,A,B,C,D,WL56,SL56,KL3); - RIP4(D,E,A,B,C,WL57,SL57,KL3); - RIP4(C,D,E,A,B,WL58,SL58,KL3); - RIP4(B,C,D,E,A,WL59,SL59,KL3); - RIP4(A,B,C,D,E,WL60,SL60,KL3); - RIP4(E,A,B,C,D,WL61,SL61,KL3); - RIP4(D,E,A,B,C,WL62,SL62,KL3); - RIP4(C,D,E,A,B,WL63,SL63,KL3); - - RIP5(B,C,D,E,A,WL64,SL64,KL4); - RIP5(A,B,C,D,E,WL65,SL65,KL4); - RIP5(E,A,B,C,D,WL66,SL66,KL4); - RIP5(D,E,A,B,C,WL67,SL67,KL4); - RIP5(C,D,E,A,B,WL68,SL68,KL4); - RIP5(B,C,D,E,A,WL69,SL69,KL4); - RIP5(A,B,C,D,E,WL70,SL70,KL4); - 
RIP5(E,A,B,C,D,WL71,SL71,KL4); - RIP5(D,E,A,B,C,WL72,SL72,KL4); - RIP5(C,D,E,A,B,WL73,SL73,KL4); - RIP5(B,C,D,E,A,WL74,SL74,KL4); - RIP5(A,B,C,D,E,WL75,SL75,KL4); - RIP5(E,A,B,C,D,WL76,SL76,KL4); - RIP5(D,E,A,B,C,WL77,SL77,KL4); - RIP5(C,D,E,A,B,WL78,SL78,KL4); - RIP5(B,C,D,E,A,WL79,SL79,KL4); - - a=A; b=B; c=C; d=D; e=E; - /* Do other half */ - A=ctx->A; B=ctx->B; C=ctx->C; D=ctx->D; E=ctx->E; - - RIP5(A,B,C,D,E,WR00,SR00,KR0); - RIP5(E,A,B,C,D,WR01,SR01,KR0); - RIP5(D,E,A,B,C,WR02,SR02,KR0); - RIP5(C,D,E,A,B,WR03,SR03,KR0); - RIP5(B,C,D,E,A,WR04,SR04,KR0); - RIP5(A,B,C,D,E,WR05,SR05,KR0); - RIP5(E,A,B,C,D,WR06,SR06,KR0); - RIP5(D,E,A,B,C,WR07,SR07,KR0); - RIP5(C,D,E,A,B,WR08,SR08,KR0); - RIP5(B,C,D,E,A,WR09,SR09,KR0); - RIP5(A,B,C,D,E,WR10,SR10,KR0); - RIP5(E,A,B,C,D,WR11,SR11,KR0); - RIP5(D,E,A,B,C,WR12,SR12,KR0); - RIP5(C,D,E,A,B,WR13,SR13,KR0); - RIP5(B,C,D,E,A,WR14,SR14,KR0); - RIP5(A,B,C,D,E,WR15,SR15,KR0); - - RIP4(E,A,B,C,D,WR16,SR16,KR1); - RIP4(D,E,A,B,C,WR17,SR17,KR1); - RIP4(C,D,E,A,B,WR18,SR18,KR1); - RIP4(B,C,D,E,A,WR19,SR19,KR1); - RIP4(A,B,C,D,E,WR20,SR20,KR1); - RIP4(E,A,B,C,D,WR21,SR21,KR1); - RIP4(D,E,A,B,C,WR22,SR22,KR1); - RIP4(C,D,E,A,B,WR23,SR23,KR1); - RIP4(B,C,D,E,A,WR24,SR24,KR1); - RIP4(A,B,C,D,E,WR25,SR25,KR1); - RIP4(E,A,B,C,D,WR26,SR26,KR1); - RIP4(D,E,A,B,C,WR27,SR27,KR1); - RIP4(C,D,E,A,B,WR28,SR28,KR1); - RIP4(B,C,D,E,A,WR29,SR29,KR1); - RIP4(A,B,C,D,E,WR30,SR30,KR1); - RIP4(E,A,B,C,D,WR31,SR31,KR1); - - RIP3(D,E,A,B,C,WR32,SR32,KR2); - RIP3(C,D,E,A,B,WR33,SR33,KR2); - RIP3(B,C,D,E,A,WR34,SR34,KR2); - RIP3(A,B,C,D,E,WR35,SR35,KR2); - RIP3(E,A,B,C,D,WR36,SR36,KR2); - RIP3(D,E,A,B,C,WR37,SR37,KR2); - RIP3(C,D,E,A,B,WR38,SR38,KR2); - RIP3(B,C,D,E,A,WR39,SR39,KR2); - RIP3(A,B,C,D,E,WR40,SR40,KR2); - RIP3(E,A,B,C,D,WR41,SR41,KR2); - RIP3(D,E,A,B,C,WR42,SR42,KR2); - RIP3(C,D,E,A,B,WR43,SR43,KR2); - RIP3(B,C,D,E,A,WR44,SR44,KR2); - RIP3(A,B,C,D,E,WR45,SR45,KR2); - RIP3(E,A,B,C,D,WR46,SR46,KR2); - RIP3(D,E,A,B,C,WR47,SR47,KR2); - - RIP2(C,D,E,A,B,WR48,SR48,KR3); - RIP2(B,C,D,E,A,WR49,SR49,KR3); - RIP2(A,B,C,D,E,WR50,SR50,KR3); - RIP2(E,A,B,C,D,WR51,SR51,KR3); - RIP2(D,E,A,B,C,WR52,SR52,KR3); - RIP2(C,D,E,A,B,WR53,SR53,KR3); - RIP2(B,C,D,E,A,WR54,SR54,KR3); - RIP2(A,B,C,D,E,WR55,SR55,KR3); - RIP2(E,A,B,C,D,WR56,SR56,KR3); - RIP2(D,E,A,B,C,WR57,SR57,KR3); - RIP2(C,D,E,A,B,WR58,SR58,KR3); - RIP2(B,C,D,E,A,WR59,SR59,KR3); - RIP2(A,B,C,D,E,WR60,SR60,KR3); - RIP2(E,A,B,C,D,WR61,SR61,KR3); - RIP2(D,E,A,B,C,WR62,SR62,KR3); - RIP2(C,D,E,A,B,WR63,SR63,KR3); - - RIP1(B,C,D,E,A,WR64,SR64); - RIP1(A,B,C,D,E,WR65,SR65); - RIP1(E,A,B,C,D,WR66,SR66); - RIP1(D,E,A,B,C,WR67,SR67); - RIP1(C,D,E,A,B,WR68,SR68); - RIP1(B,C,D,E,A,WR69,SR69); - RIP1(A,B,C,D,E,WR70,SR70); - RIP1(E,A,B,C,D,WR71,SR71); - RIP1(D,E,A,B,C,WR72,SR72); - RIP1(C,D,E,A,B,WR73,SR73); - RIP1(B,C,D,E,A,WR74,SR74); - RIP1(A,B,C,D,E,WR75,SR75); - RIP1(E,A,B,C,D,WR76,SR76); - RIP1(D,E,A,B,C,WR77,SR77); - RIP1(C,D,E,A,B,WR78,SR78); - RIP1(B,C,D,E,A,WR79,SR79); - - D =ctx->B+c+D; - ctx->B=ctx->C+d+E; - ctx->C=ctx->D+e+A; - ctx->D=ctx->E+a+B; - ctx->E=ctx->A+b+C; - ctx->A=D; - - } - } -#endif - #ifndef ripemd160_block_data_order #ifdef X #undef X diff --git a/crypto/ripemd/rmd_locl.h b/crypto/ripemd/rmd_locl.h index b52d786165..ce12a8000e 100644 --- a/crypto/ripemd/rmd_locl.h +++ b/crypto/ripemd/rmd_locl.h @@ -72,32 +72,20 @@ */ #ifdef RMD160_ASM # if defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__INTEL__) -# if !defined(B_ENDIAN) -# define ripemd160_block_host_order ripemd160_block_asm_host_order 
-# endif +# define ripemd160_block_host_order ripemd160_block_asm_data_order # endif #endif -void ripemd160_block_host_order (RIPEMD160_CTX *c, const void *p,size_t num); void ripemd160_block_data_order (RIPEMD160_CTX *c, const void *p,size_t num); -#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__INTEL__) -# if !defined(B_ENDIAN) -# define ripemd160_block_data_order ripemd160_block_host_order -# endif -#endif - #define DATA_ORDER_IS_LITTLE_ENDIAN #define HASH_LONG RIPEMD160_LONG -#define HASH_LONG_LOG2 RIPEMD160_LONG_LOG2 #define HASH_CTX RIPEMD160_CTX #define HASH_CBLOCK RIPEMD160_CBLOCK -#define HASH_LBLOCK RIPEMD160_LBLOCK #define HASH_UPDATE RIPEMD160_Update #define HASH_TRANSFORM RIPEMD160_Transform #define HASH_FINAL RIPEMD160_Final -#define HASH_BLOCK_HOST_ORDER ripemd160_block_host_order #define HASH_MAKE_STRING(c,s) do { \ unsigned long ll; \ ll=(c)->A; HOST_l2c(ll,(s)); \ @@ -106,9 +94,7 @@ void ripemd160_block_data_order (RIPEMD160_CTX *c, const void *p,size_t num); ll=(c)->D; HOST_l2c(ll,(s)); \ ll=(c)->E; HOST_l2c(ll,(s)); \ } while (0) -#if !defined(L_ENDIAN) || defined(ripemd160_block_data_order) #define HASH_BLOCK_DATA_ORDER ripemd160_block_data_order -#endif #include "md32_common.h" diff --git a/crypto/sha/asm/sha1-586.pl b/crypto/sha/asm/sha1-586.pl index 4f8521f1e2..0b4dab2bd5 100644 --- a/crypto/sha/asm/sha1-586.pl +++ b/crypto/sha/asm/sha1-586.pl @@ -1,4 +1,16 @@ -#!/usr/local/bin/perl +#!/usr/bin/env perl + +# ==================================================================== +# [Re]written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== + +# "[Re]written" was achieved in two major overhauls. In 2004 BODY_* +# functions were re-implemented to address P4 performance issue [see +# commentary below], and in 2006 the rest was rewritten in order to +# gain freedom to liberate licensing terms. # It was noted that Intel IA-32 C compiler generates code which # performs ~30% *faster* on P4 CPU than original *hand-coded* @@ -17,90 +29,27 @@ # improvement on P4 outweights the loss and incorporate this # re-tuned code to 0.9.7 and later. # ---------------------------------------------------------------- -# Those who for any particular reason absolutely must score on -# Pentium can replace this module with one from 0.9.6 distribution. -# This "offer" shall be revoked the moment programming interface to -# this module is changed, in which case this paragraph should be -# removed. 
-# ---------------------------------------------------------------- # -$normal=0; - -push(@INC,"perlasm","../../perlasm"); +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; &asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386"); $A="eax"; -$B="ecx"; -$C="ebx"; +$B="ebx"; +$C="ecx"; $D="edx"; $E="edi"; $T="esi"; $tmp1="ebp"; -$off=9*4; - -@K=(0x5a827999,0x6ed9eba1,0x8f1bbcdc,0xca62c1d6); - -&sha1_block_data("sha1_block_asm_data_order"); - -&asm_finish(); - -sub Nn - { - local($p)=@_; - local(%n)=($A,$T,$B,$A,$C,$B,$D,$C,$E,$D,$T,$E); - return($n{$p}); - } - -sub Np - { - local($p)=@_; - local(%n)=($A,$T,$B,$A,$C,$B,$D,$C,$E,$D,$T,$E); - local(%n)=($A,$B,$B,$C,$C,$D,$D,$E,$E,$T,$T,$A); - return($n{$p}); - } - -sub Na - { - local($n)=@_; - return( (($n )&0x0f), - (($n+ 2)&0x0f), - (($n+ 8)&0x0f), - (($n+13)&0x0f), - (($n+ 1)&0x0f)); - } - -sub X_expand - { - local($in)=@_; - - &comment("First, load the words onto the stack in network byte order"); - for ($i=0; $i<16; $i+=2) - { - &mov($A,&DWP(($i+0)*4,$in,"",0));# unless $i == 0; - &mov($B,&DWP(($i+1)*4,$in,"",0)); - &bswap($A); - &bswap($B); - &mov(&swtmp($i+0),$A); - &mov(&swtmp($i+1),$B); - } - - &comment("We now have the X array on the stack"); - &comment("starting at sp-4"); - } - -# Rules of engagement -# F is always trashable at the start, the running total. -# E becomes the next F so it can be trashed after it has been 'accumulated' -# F becomes A in the next round. We don't need to access it much. -# During the X update part, the result ends up in $X[$n0]. +@V=($A,$B,$C,$D,$E,$T); sub BODY_00_15 { - local($pos,$K,$X,$n,$a,$b,$c,$d,$e,$f)=@_; + local($n,$a,$b,$c,$d,$e,$f)=@_; &comment("00_15 $n"); @@ -109,37 +58,37 @@ sub BODY_00_15 else { &mov($a,$tmp1); } &rotl($tmp1,5); # tmp1=ROTATE(a,5) &xor($f,$d); - &and($f,$b); - &add($tmp1,$e); # tmp1+=e; - &mov($e,&swtmp($n)); # e becomes volatile and - # is loaded with xi + &add($tmp1,$e); # tmp1+=e; + &and($f,$b); + &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded + # with xi, also note that e becomes + # f in next round... 
&xor($f,$d); # f holds F_00_19(b,c,d) &rotr($b,2); # b=ROTATE(b,30) - &lea($tmp1,&DWP($K,$tmp1,$e,1));# tmp1+=K_00_19+xi + &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi if ($n==15) { &add($f,$tmp1); } # f+=tmp1 - else { &add($tmp1,$f); } + else { &add($tmp1,$f); } # f becomes a in next round } sub BODY_16_19 { - local($pos,$K,$X,$n,$a,$b,$c,$d,$e,$f)=@_; - local($n0,$n1,$n2,$n3,$np)=&Na($n); + local($n,$a,$b,$c,$d,$e,$f)=@_; &comment("16_19 $n"); - &mov($f,&swtmp($n1)); # f to hold Xupdate(xi,xa,xb,xc,xd) + &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d) - &xor($f,&swtmp($n0)); + &xor($f,&swtmp(($n+2)%16)); &xor($tmp1,$d); - &xor($f,&swtmp($n2)); + &xor($f,&swtmp(($n+8)%16)); &and($tmp1,$b); # tmp1 holds F_00_19(b,c,d) &rotr($b,2); # b=ROTATE(b,30) - &xor($f,&swtmp($n3)); # f holds xa^xb^xc^xd - &rotl($f,1); # f=ROATE(f,1) + &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd + &rotl($f,1); # f=ROTATE(f,1) &xor($tmp1,$d); # tmp1=F_00_19(b,c,d) - &mov(&swtmp($n0),$f); # xi=f - &lea($f,&DWP($K,$f,$e,1)); # f+=K_00_19+e + &mov(&swtmp($n%16),$f); # xi=f + &lea($f,&DWP(0x5a827999,$f,$e));# f+=K_00_19+e &mov($e,$a); # e becomes volatile &rotl($e,5); # e=ROTATE(a,5) &add($f,$tmp1); # f+=F_00_19(b,c,d) @@ -148,48 +97,47 @@ sub BODY_16_19 sub BODY_20_39 { - local($pos,$K,$X,$n,$a,$b,$c,$d,$e,$f)=@_; + local($n,$a,$b,$c,$d,$e,$f)=@_; + local $K=($n<40)?0x6ed9eba1:0xca62c1d6; &comment("20_39 $n"); - local($n0,$n1,$n2,$n3,$np)=&Na($n); &mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d) - &mov($f,&swtmp($n0)); # f to hold Xupdate(xi,xa,xb,xc,xd) + &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &rotr($b,2); # b=ROTATE(b,30) - &xor($f,&swtmp($n1)); + &xor($f,&swtmp(($n+2)%16)); &xor($tmp1,$c); - &xor($f,&swtmp($n2)); + &xor($f,&swtmp(($n+8)%16)); &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d) - &xor($f,&swtmp($n3)); # f holds xa^xb^xc^xd + &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &add($tmp1,$e); - &mov(&swtmp($n0),$f); # xi=f + &mov(&swtmp($n%16),$f); # xi=f &mov($e,$a); # e becomes volatile &rotl($e,5); # e=ROTATE(a,5) - &lea($f,&DWP($K,$f,$tmp1,1)); # f+=K_20_39+e + &lea($f,&DWP($K,$f,$tmp1)); # f+=K_20_39+e &add($f,$e); # f+=ROTATE(a,5) } sub BODY_40_59 { - local($pos,$K,$X,$n,$a,$b,$c,$d,$e,$f)=@_; + local($n,$a,$b,$c,$d,$e,$f)=@_; &comment("40_59 $n"); - local($n0,$n1,$n2,$n3,$np)=&Na($n); - &mov($f,&swtmp($n0)); # f to hold Xupdate(xi,xa,xb,xc,xd) - &mov($tmp1,&swtmp($n1)); + &mov($f,&swtmp($n%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) + &mov($tmp1,&swtmp(($n+2)%16)); &xor($f,$tmp1); - &mov($tmp1,&swtmp($n2)); + &mov($tmp1,&swtmp(($n+8)%16)); &xor($f,$tmp1); - &mov($tmp1,&swtmp($n3)); + &mov($tmp1,&swtmp(($n+13)%16)); &xor($f,$tmp1); # f holds xa^xb^xc^xd &mov($tmp1,$b); # tmp1 to hold F_40_59(b,c,d) &rotl($f,1); # f=ROTATE(f,1) &or($tmp1,$c); - &mov(&swtmp($n0),$f); # xi=f + &mov(&swtmp($n%16),$f); # xi=f &and($tmp1,$d); - &lea($f,&DWP($K,$f,$e,1)); # f+=K_40_59+e + &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e &mov($e,$b); # e becomes volatile and is used # to calculate F_40_59(b,c,d) &rotr($b,2); # b=ROTATE(b,30) @@ -201,230 +149,71 @@ sub BODY_40_59 &add($f,$e); # f+=ROTATE(a,5) } -sub BODY_60_79 - { - &BODY_20_39(@_); - } - -sub sha1_block_host - { - local($name, $sclabel)=@_; - - &function_begin_B($name,""); - - # parameter 1 is the MD5_CTX structure. 
- # A 0 - # B 4 - # C 8 - # D 12 - # E 16 - - &mov("ecx", &wparam(2)); - &push("esi"); - &shl("ecx",6); - &mov("esi", &wparam(1)); - &push("ebp"); - &add("ecx","esi"); # offset to leave on - &push("ebx"); - &mov("ebp", &wparam(0)); - &push("edi"); - &mov($D, &DWP(12,"ebp","",0)); - &stack_push(18+9); - &mov($E, &DWP(16,"ebp","",0)); - &mov($C, &DWP( 8,"ebp","",0)); - &mov(&swtmp(17),"ecx"); +&function_begin("sha1_block_data_order",16); + &mov($tmp1,&wparam(0)); # SHA_CTX *c + &mov($T,&wparam(1)); # const void *input + &mov($A,&wparam(2)); # size_t num + &stack_push(16); # allocate X[16] + &shl($A,6); + &add($A,$T); + &mov(&wparam(2),$A); # pointer beyond the end of input + &mov($E,&DWP(16,$tmp1));# pre-load E - &comment("First we need to setup the X array"); + &set_label("loop",16); - for ($i=0; $i<16; $i+=2) + # copy input chunk to X, but reversing byte order! + for ($i=0; $i<16; $i+=4) { - &mov($A,&DWP(($i+0)*4,"esi","",0));# unless $i == 0; - &mov($B,&DWP(($i+1)*4,"esi","",0)); + &mov($A,&DWP(4*($i+0),$T)); + &mov($B,&DWP(4*($i+1),$T)); + &mov($C,&DWP(4*($i+2),$T)); + &mov($D,&DWP(4*($i+3),$T)); + &bswap($A); + &bswap($B); + &bswap($C); + &bswap($D); &mov(&swtmp($i+0),$A); - &mov(&swtmp($i+1),$B); + &mov(&swtmp($i+1),$B); + &mov(&swtmp($i+2),$C); + &mov(&swtmp($i+3),$D); } - &jmp($sclabel); - &function_end_B($name); - } - - -sub sha1_block_data - { - local($name)=@_; - - &function_begin_B($name,""); - - # parameter 1 is the MD5_CTX structure. - # A 0 - # B 4 - # C 8 - # D 12 - # E 16 - - &mov("ecx", &wparam(2)); - &push("esi"); - &shl("ecx",6); - &mov("esi", &wparam(1)); - &push("ebp"); - &add("ecx","esi"); # offset to leave on - &push("ebx"); - &mov("ebp", &wparam(0)); - &push("edi"); - &mov($D, &DWP(12,"ebp","",0)); - &stack_push(18+9); - &mov($E, &DWP(16,"ebp","",0)); - &mov($C, &DWP( 8,"ebp","",0)); - &mov(&swtmp(17),"ecx"); - - &comment("First we need to setup the X array"); - - &set_label("start") unless $normal; - - &X_expand("esi"); - &mov(&wparam(1),"esi"); - - &set_label("shortcut", 0, 1); - &comment(""); - &comment("Start processing"); - - # odd start - &mov($A, &DWP( 0,"ebp","",0)); - &mov($B, &DWP( 4,"ebp","",0)); - $X="esp"; - &BODY_00_15(-2,$K[0],$X, 0,$A,$B,$C,$D,$E,$T); - &BODY_00_15( 0,$K[0],$X, 1,$T,$A,$B,$C,$D,$E); - &BODY_00_15( 0,$K[0],$X, 2,$E,$T,$A,$B,$C,$D); - &BODY_00_15( 0,$K[0],$X, 3,$D,$E,$T,$A,$B,$C); - &BODY_00_15( 0,$K[0],$X, 4,$C,$D,$E,$T,$A,$B); - &BODY_00_15( 0,$K[0],$X, 5,$B,$C,$D,$E,$T,$A); - &BODY_00_15( 0,$K[0],$X, 6,$A,$B,$C,$D,$E,$T); - &BODY_00_15( 0,$K[0],$X, 7,$T,$A,$B,$C,$D,$E); - &BODY_00_15( 0,$K[0],$X, 8,$E,$T,$A,$B,$C,$D); - &BODY_00_15( 0,$K[0],$X, 9,$D,$E,$T,$A,$B,$C); - &BODY_00_15( 0,$K[0],$X,10,$C,$D,$E,$T,$A,$B); - &BODY_00_15( 0,$K[0],$X,11,$B,$C,$D,$E,$T,$A); - &BODY_00_15( 0,$K[0],$X,12,$A,$B,$C,$D,$E,$T); - &BODY_00_15( 0,$K[0],$X,13,$T,$A,$B,$C,$D,$E); - &BODY_00_15( 0,$K[0],$X,14,$E,$T,$A,$B,$C,$D); - &BODY_00_15( 1,$K[0],$X,15,$D,$E,$T,$A,$B,$C); - &BODY_16_19(-1,$K[0],$X,16,$C,$D,$E,$T,$A,$B); - &BODY_16_19( 0,$K[0],$X,17,$B,$C,$D,$E,$T,$A); - &BODY_16_19( 0,$K[0],$X,18,$A,$B,$C,$D,$E,$T); - &BODY_16_19( 1,$K[0],$X,19,$T,$A,$B,$C,$D,$E); - - &BODY_20_39(-1,$K[1],$X,20,$E,$T,$A,$B,$C,$D); - &BODY_20_39( 0,$K[1],$X,21,$D,$E,$T,$A,$B,$C); - &BODY_20_39( 0,$K[1],$X,22,$C,$D,$E,$T,$A,$B); - &BODY_20_39( 0,$K[1],$X,23,$B,$C,$D,$E,$T,$A); - &BODY_20_39( 0,$K[1],$X,24,$A,$B,$C,$D,$E,$T); - &BODY_20_39( 0,$K[1],$X,25,$T,$A,$B,$C,$D,$E); - &BODY_20_39( 0,$K[1],$X,26,$E,$T,$A,$B,$C,$D); - &BODY_20_39( 0,$K[1],$X,27,$D,$E,$T,$A,$B,$C); 
- &BODY_20_39( 0,$K[1],$X,28,$C,$D,$E,$T,$A,$B); - &BODY_20_39( 0,$K[1],$X,29,$B,$C,$D,$E,$T,$A); - &BODY_20_39( 0,$K[1],$X,30,$A,$B,$C,$D,$E,$T); - &BODY_20_39( 0,$K[1],$X,31,$T,$A,$B,$C,$D,$E); - &BODY_20_39( 0,$K[1],$X,32,$E,$T,$A,$B,$C,$D); - &BODY_20_39( 0,$K[1],$X,33,$D,$E,$T,$A,$B,$C); - &BODY_20_39( 0,$K[1],$X,34,$C,$D,$E,$T,$A,$B); - &BODY_20_39( 0,$K[1],$X,35,$B,$C,$D,$E,$T,$A); - &BODY_20_39( 0,$K[1],$X,36,$A,$B,$C,$D,$E,$T); - &BODY_20_39( 0,$K[1],$X,37,$T,$A,$B,$C,$D,$E); - &BODY_20_39( 0,$K[1],$X,38,$E,$T,$A,$B,$C,$D); - &BODY_20_39( 1,$K[1],$X,39,$D,$E,$T,$A,$B,$C); - - &BODY_40_59(-1,$K[2],$X,40,$C,$D,$E,$T,$A,$B); - &BODY_40_59( 0,$K[2],$X,41,$B,$C,$D,$E,$T,$A); - &BODY_40_59( 0,$K[2],$X,42,$A,$B,$C,$D,$E,$T); - &BODY_40_59( 0,$K[2],$X,43,$T,$A,$B,$C,$D,$E); - &BODY_40_59( 0,$K[2],$X,44,$E,$T,$A,$B,$C,$D); - &BODY_40_59( 0,$K[2],$X,45,$D,$E,$T,$A,$B,$C); - &BODY_40_59( 0,$K[2],$X,46,$C,$D,$E,$T,$A,$B); - &BODY_40_59( 0,$K[2],$X,47,$B,$C,$D,$E,$T,$A); - &BODY_40_59( 0,$K[2],$X,48,$A,$B,$C,$D,$E,$T); - &BODY_40_59( 0,$K[2],$X,49,$T,$A,$B,$C,$D,$E); - &BODY_40_59( 0,$K[2],$X,50,$E,$T,$A,$B,$C,$D); - &BODY_40_59( 0,$K[2],$X,51,$D,$E,$T,$A,$B,$C); - &BODY_40_59( 0,$K[2],$X,52,$C,$D,$E,$T,$A,$B); - &BODY_40_59( 0,$K[2],$X,53,$B,$C,$D,$E,$T,$A); - &BODY_40_59( 0,$K[2],$X,54,$A,$B,$C,$D,$E,$T); - &BODY_40_59( 0,$K[2],$X,55,$T,$A,$B,$C,$D,$E); - &BODY_40_59( 0,$K[2],$X,56,$E,$T,$A,$B,$C,$D); - &BODY_40_59( 0,$K[2],$X,57,$D,$E,$T,$A,$B,$C); - &BODY_40_59( 0,$K[2],$X,58,$C,$D,$E,$T,$A,$B); - &BODY_40_59( 1,$K[2],$X,59,$B,$C,$D,$E,$T,$A); - - &BODY_60_79(-1,$K[3],$X,60,$A,$B,$C,$D,$E,$T); - &BODY_60_79( 0,$K[3],$X,61,$T,$A,$B,$C,$D,$E); - &BODY_60_79( 0,$K[3],$X,62,$E,$T,$A,$B,$C,$D); - &BODY_60_79( 0,$K[3],$X,63,$D,$E,$T,$A,$B,$C); - &BODY_60_79( 0,$K[3],$X,64,$C,$D,$E,$T,$A,$B); - &BODY_60_79( 0,$K[3],$X,65,$B,$C,$D,$E,$T,$A); - &BODY_60_79( 0,$K[3],$X,66,$A,$B,$C,$D,$E,$T); - &BODY_60_79( 0,$K[3],$X,67,$T,$A,$B,$C,$D,$E); - &BODY_60_79( 0,$K[3],$X,68,$E,$T,$A,$B,$C,$D); - &BODY_60_79( 0,$K[3],$X,69,$D,$E,$T,$A,$B,$C); - &BODY_60_79( 0,$K[3],$X,70,$C,$D,$E,$T,$A,$B); - &BODY_60_79( 0,$K[3],$X,71,$B,$C,$D,$E,$T,$A); - &BODY_60_79( 0,$K[3],$X,72,$A,$B,$C,$D,$E,$T); - &BODY_60_79( 0,$K[3],$X,73,$T,$A,$B,$C,$D,$E); - &BODY_60_79( 0,$K[3],$X,74,$E,$T,$A,$B,$C,$D); - &BODY_60_79( 0,$K[3],$X,75,$D,$E,$T,$A,$B,$C); - &BODY_60_79( 0,$K[3],$X,76,$C,$D,$E,$T,$A,$B); - &BODY_60_79( 0,$K[3],$X,77,$B,$C,$D,$E,$T,$A); - &BODY_60_79( 0,$K[3],$X,78,$A,$B,$C,$D,$E,$T); - &BODY_60_79( 2,$K[3],$X,79,$T,$A,$B,$C,$D,$E); - - &comment("End processing"); - &comment(""); - # D is the tmp value - - # E -> A - # T -> B - # A -> C - # B -> D - # C -> E - # D -> T - - &mov($tmp1,&wparam(0)); - - &mov($D, &DWP(12,$tmp1,"",0)); - &add($D,$B); - &mov($B, &DWP( 4,$tmp1,"",0)); - &add($B,$T); - &mov($T, $A); - &mov($A, &DWP( 0,$tmp1,"",0)); - &mov(&DWP(12,$tmp1,"",0),$D); - - &add($A,$E); - &mov($E, &DWP(16,$tmp1,"",0)); - &add($E,$C); - &mov($C, &DWP( 8,$tmp1,"",0)); - &add($C,$T); - - &mov(&DWP( 0,$tmp1,"",0),$A); - &mov("esi",&wparam(1)); - &mov(&DWP( 8,$tmp1,"",0),$C); - &add("esi",64); - &mov("eax",&swtmp(17)); - &mov(&DWP(16,$tmp1,"",0),$E); - &cmp("esi","eax"); - &mov(&DWP( 4,$tmp1,"",0),$B); - &jb(&label("start")); - - &stack_pop(18+9); - &pop("edi"); - &pop("ebx"); - &pop("ebp"); - &pop("esi"); - &ret(); - - # keep a note of shortcut label so it can be used outside - # block. 
- my $sclabel = &label("shortcut"); - - &function_end_B($name); - # Putting this here avoids problems with MASM in debugging mode - &sha1_block_host("sha1_block_asm_host_order", $sclabel); - } + &mov(&wparam(1),$T); # redundant in 1st spin + + &mov($A,&DWP(0,$tmp1)); # load SHA_CTX + &mov($B,&DWP(4,$tmp1)); + &mov($C,&DWP(8,$tmp1)); + &mov($D,&DWP(12,$tmp1)); + # E is pre-loaded + + for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } + for(;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); } + for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } + for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } + for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } + + (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check + + &mov($tmp1,&wparam(0)); # re-load SHA_CTX* + &mov($D,&wparam(1)); # D is last "T" and is discarded + + &add($E,&DWP(0,$tmp1)); # E is last "A"... + &add($T,&DWP(4,$tmp1)); + &add($A,&DWP(8,$tmp1)); + &add($B,&DWP(12,$tmp1)); + &add($C,&DWP(16,$tmp1)); + + &mov(&DWP(0,$tmp1),$E); # update SHA_CTX + &add($D,64); # advance input pointer + &mov(&DWP(4,$tmp1),$T); + &cmp($D,&wparam(2)); # have we reached the end yet? + &mov(&DWP(8,$tmp1),$A); + &mov($E,$C); # C is last "E" which needs to be "pre-loaded" + &mov(&DWP(12,$tmp1),$B); + &mov($T,$D); # input pointer + &mov(&DWP(16,$tmp1),$C); + &jb(&label("loop")); + + &stack_pop(16); +&function_end("sha1_block_data_order"); +&asm_finish(); diff --git a/crypto/sha/asm/sha1-ia64.pl b/crypto/sha/asm/sha1-ia64.pl index cb9dfad124..aa18c1089b 100644 --- a/crypto/sha/asm/sha1-ia64.pl +++ b/crypto/sha/asm/sha1-ia64.pl @@ -2,8 +2,9 @@ # # ==================================================================== # Written by Andy Polyakov for the OpenSSL -# project. Rights for redistribution and usage in source and binary -# forms are granted according to the OpenSSL license. +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # Eternal question is what's wrong with compiler generated code? The @@ -11,15 +12,10 @@ # to perform rotations by maintaining copy of 32-bit value in upper # bits of 64-bit register. Just follow mux2 and shrp instructions... # Performance under big-endian OS such as HP-UX is 179MBps*1GHz, which -# is >50% better than HP C and >2x better than gcc. As of this moment -# performance under little-endian OS such as Linux and Windows will be -# a bit lower, because data has to be picked in reverse byte-order. -# It's possible to resolve this issue by implementing third function, -# sha1_block_asm_data_order_aligned, which would temporarily flip -# BE field in User Mask register... +# is >50% better than HP C and >2x better than gcc. 
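The rotation trick referred to in the comment above — keep a copy of the 32-bit value in the upper half of a 64-bit register with mux2, then rotate with a single shrp funnel shift — is easy to verify in plain C. A minimal, stand-alone sketch follows; it is illustrative only, not part of the module, and the function name is made up:

/* ROTATE(x,30) via a 64-bit shift of the value duplicated into both halves,
 * mirroring the "mux2 tmp6=$a,0x44" / "shrp $b=tmp6,tmp6,2" pair used below. */
#include <stdint.h>
#include <stdio.h>

static uint32_t rot30_via_duplicate(uint32_t x)
{
    uint64_t t = ((uint64_t)x << 32) | x;   /* x in both 32-bit halves */
    return (uint32_t)(t >> 2);              /* low half == ROTATE(x,30) */
}

int main(void)
{
    uint32_t x = 0x12345678;
    uint32_t ref = (x << 30) | (x >> 2);    /* conventional rotate */
    printf("%08x %08x\n", rot30_via_duplicate(x), ref);
    return 0;
}

The duplicate kept by mux2 lets shrp perform the whole rotation in one instruction, which is what the comment credits for beating compiler-generated code.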
$code=<<___; -.ident \"sha1-ia64.s, version 1.0\" +.ident \"sha1-ia64.s, version 1.2\" .ident \"IA-64 ISA artwork by Andy Polyakov \" .explicit @@ -55,63 +51,55 @@ else { sub BODY_00_15 { local *code=shift; -local ($i,$a,$b,$c,$d,$e,$f,$unaligned)=@_; +local ($i,$a,$b,$c,$d,$e,$f)=@_; -if ($unaligned) { - $code.=<<___; -{ .mmi; ld1 tmp0=[inp],2 // MSB - ld1 tmp1=[tmp3],2 };; -{ .mmi; ld1 tmp2=[inp],2 - ld1 $X[$i&0xf]=[tmp3],2 // LSB - dep tmp1=tmp0,tmp1,8,8 };; -{ .mii; cmp.ne p16,p0=r0,r0 // no misaligned prefetch - dep $X[$i&0xf]=tmp2,$X[$i&0xf],8,8;; - dep $X[$i&0xf]=tmp1,$X[$i&0xf],16,16 };; -{ .mmi; nop.m 0 -___ - } -elsif ($i<15) { - $code.=<<___; -{ .mmi; ld4 $X[($i+1)&0xf]=[inp],4 // prefetch -___ - } -else { - $code.=<<___; -{ .mmi; nop.m 0 +$code.=<<___ if ($i==0); +{ .mmi; ld1 $X[$i&0xf]=[inp],2 // MSB + ld1 tmp2=[tmp3],2 };; +{ .mmi; ld1 tmp0=[inp],2 + ld1 tmp4=[tmp3],2 // LSB + dep $X[$i&0xf]=$X[$i&0xf],tmp2,8,8 };; ___ - } if ($i<15) { $code.=<<___; - and tmp0=$c,$b - dep.z tmp5=$a,5,27 } // a<<5 +{ .mmi; ld1 $X[($i+1)&0xf]=[inp],2 // +1 + dep tmp1=tmp0,tmp4,8,8 };; +{ .mmi; ld1 tmp2=[tmp3],2 // +1 + and tmp4=$c,$b + dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;; { .mmi; andcm tmp1=$d,$b - add tmp4=$e,$K_00_19 };; -{ .mmi; or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) - add $f=tmp4,$X[$i&0xf] // f=xi+e+K_00_19 + add tmp0=$e,$K_00_19 + dep.z tmp5=$a,5,27 };; // a<<5 +{ .mmi; or tmp4=tmp4,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) + add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19 extr.u tmp1=$a,27,5 };; // a>>27 -{ .mib; add $f=$f,tmp0 // f+=F_00_19(b,c,d) +{ .mmi; ld1 tmp0=[inp],2 // +1 + add $f=$f,tmp4 // f+=F_00_19(b,c,d) shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) -{ .mib; or tmp1=tmp1,tmp5 // ROTATE(a,5) +{ .mmi; ld1 tmp4=[tmp3],2 // +1 + or tmp5=tmp1,tmp5 // ROTATE(a,5) mux2 tmp6=$a,0x44 };; // see b in next iteration -{ .mii; add $f=$f,tmp1 // f+=ROTATE(a,5) - mux2 $X[$i&0xf]=$X[$i&0xf],0x44 - nop.i 0 };; +{ .mii; add $f=$f,tmp5 // f+=ROTATE(a,5) + dep $X[($i+1)&0xf]=$X[($i+1)&0xf],tmp2,8,8 // +1 + mux2 $X[$i&0xf]=$X[$i&0xf],0x44 } //;; ___ } else { $code.=<<___; - and tmp0=$c,$b - dep.z tmp5=$a,5,27 } // a<<5 ;;? +{ .mii; and tmp3=$c,$b + dep tmp1=tmp0,tmp4,8,8;; + dep $X[$i&0xf]=$X[$i&0xf],tmp1,16,16 } //;; { .mmi; andcm tmp1=$d,$b - add tmp4=$e,$K_00_19 };; -{ .mmi; or tmp0=tmp0,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) - add $f=tmp4,$X[$i&0xf] // f=xi+e+K_00_19 + add tmp0=$e,$K_00_19 + dep.z tmp5=$a,5,27 };; // a<<5 +{ .mmi; or tmp4=tmp3,tmp1 // F_00_19(b,c,d)=(b&c)|(~b&d) + add $f=tmp0,$X[$i&0xf] // f=xi+e+K_00_19 extr.u tmp1=$a,27,5 } // a>>27 { .mmi; xor tmp2=$X[($i+0+1)&0xf],$X[($i+2+1)&0xf] // +1 xor tmp3=$X[($i+8+1)&0xf],$X[($i+13+1)&0xf] // +1 nop.i 0 };; -{ .mmi; add $f=$f,tmp0 // f+=F_00_19(b,c,d) +{ .mmi; add $f=$f,tmp4 // f+=F_00_19(b,c,d) xor tmp2=tmp2,tmp3 // +1 shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) { .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) @@ -190,9 +178,7 @@ $code.=<<___; extr.u tmp1=$a,27,5 } // a>>27 { .mib; add $f=$f,tmp4 // f+=e+K_20_39 add $h1=$h1,$a };; // wrap up -{ .mmi; -(p16) ld4.s $X[0]=[inp],4 // non-faulting prefetch - add $f=$f,tmp0 // f+=F_20_39(b,c,d) +{ .mmi; add $f=$f,tmp0 // f+=F_20_39(b,c,d) shrp $b=tmp6,tmp6,2 } // b=ROTATE(b,30) ;;? 
{ .mmi; or tmp1=tmp1,tmp5 // ROTATE(a,5) add $h3=$h3,$c };; // wrap up @@ -245,172 +231,15 @@ tmp3=r11; ctx=r32; // in0 inp=r33; // in1 -// void sha1_block_asm_host_order(SHA_CTX *c,const void *p,size_t num); -.global sha1_block_asm_host_order# -.proc sha1_block_asm_host_order# +// void sha1_block_data_order(SHA_CTX *c,const void *p,size_t num); +.global sha1_block_data_order# +.proc sha1_block_data_order# .align 32 -sha1_block_asm_host_order: +sha1_block_data_order: .prologue - .fframe 0 - .save ar.pfs,r0 - .save ar.lc,r3 { .mmi; alloc tmp1=ar.pfs,3,15,0,0 $ADDP tmp0=4,ctx - mov r3=ar.lc } -{ .mmi; $ADDP ctx=0,ctx - $ADDP inp=0,inp - mov r2=pr };; -tmp4=in2; -tmp5=loc13; -tmp6=loc14; - .body -{ .mlx; ld4 $h0=[ctx],8 - movl $K_00_19=0x5a827999 } -{ .mlx; ld4 $h1=[tmp0],8 - movl $K_20_39=0x6ed9eba1 };; -{ .mlx; ld4 $h2=[ctx],8 - movl $K_40_59=0x8f1bbcdc } -{ .mlx; ld4 $h3=[tmp0] - movl $K_60_79=0xca62c1d6 };; -{ .mmi; ld4 $h4=[ctx],-16 - add in2=-1,in2 // adjust num for ar.lc - mov ar.ec=1 };; -{ .mmi; ld4 $X[0]=[inp],4 // prefetch - cmp.ne p16,p0=r0,in2 // prefecth at loop end - mov ar.lc=in2 };; // brp.loop.imp: too far - -.Lhtop: -{ .mmi; mov $A=$h0 - mov $B=$h1 - mux2 tmp6=$h1,0x44 } -{ .mmi; mov $C=$h2 - mov $D=$h3 - mov $E=$h4 };; - -___ - - &BODY_00_15(\$code, 0,$A,$B,$C,$D,$E,$T); - &BODY_00_15(\$code, 1,$T,$A,$B,$C,$D,$E); - &BODY_00_15(\$code, 2,$E,$T,$A,$B,$C,$D); - &BODY_00_15(\$code, 3,$D,$E,$T,$A,$B,$C); - &BODY_00_15(\$code, 4,$C,$D,$E,$T,$A,$B); - &BODY_00_15(\$code, 5,$B,$C,$D,$E,$T,$A); - &BODY_00_15(\$code, 6,$A,$B,$C,$D,$E,$T); - &BODY_00_15(\$code, 7,$T,$A,$B,$C,$D,$E); - &BODY_00_15(\$code, 8,$E,$T,$A,$B,$C,$D); - &BODY_00_15(\$code, 9,$D,$E,$T,$A,$B,$C); - &BODY_00_15(\$code,10,$C,$D,$E,$T,$A,$B); - &BODY_00_15(\$code,11,$B,$C,$D,$E,$T,$A); - &BODY_00_15(\$code,12,$A,$B,$C,$D,$E,$T); - &BODY_00_15(\$code,13,$T,$A,$B,$C,$D,$E); - &BODY_00_15(\$code,14,$E,$T,$A,$B,$C,$D); - &BODY_00_15(\$code,15,$D,$E,$T,$A,$B,$C); - - &BODY_16_19(\$code,16,$C,$D,$E,$T,$A,$B); - &BODY_16_19(\$code,17,$B,$C,$D,$E,$T,$A); - &BODY_16_19(\$code,18,$A,$B,$C,$D,$E,$T); - &BODY_16_19(\$code,19,$T,$A,$B,$C,$D,$E); - - &BODY_20_39(\$code,20,$E,$T,$A,$B,$C,$D); - &BODY_20_39(\$code,21,$D,$E,$T,$A,$B,$C); - &BODY_20_39(\$code,22,$C,$D,$E,$T,$A,$B); - &BODY_20_39(\$code,23,$B,$C,$D,$E,$T,$A); - &BODY_20_39(\$code,24,$A,$B,$C,$D,$E,$T); - &BODY_20_39(\$code,25,$T,$A,$B,$C,$D,$E); - &BODY_20_39(\$code,26,$E,$T,$A,$B,$C,$D); - &BODY_20_39(\$code,27,$D,$E,$T,$A,$B,$C); - &BODY_20_39(\$code,28,$C,$D,$E,$T,$A,$B); - &BODY_20_39(\$code,29,$B,$C,$D,$E,$T,$A); - &BODY_20_39(\$code,30,$A,$B,$C,$D,$E,$T); - &BODY_20_39(\$code,31,$T,$A,$B,$C,$D,$E); - &BODY_20_39(\$code,32,$E,$T,$A,$B,$C,$D); - &BODY_20_39(\$code,33,$D,$E,$T,$A,$B,$C); - &BODY_20_39(\$code,34,$C,$D,$E,$T,$A,$B); - &BODY_20_39(\$code,35,$B,$C,$D,$E,$T,$A); - &BODY_20_39(\$code,36,$A,$B,$C,$D,$E,$T); - &BODY_20_39(\$code,37,$T,$A,$B,$C,$D,$E); - &BODY_20_39(\$code,38,$E,$T,$A,$B,$C,$D); - &BODY_20_39(\$code,39,$D,$E,$T,$A,$B,$C); - - &BODY_40_59(\$code,40,$C,$D,$E,$T,$A,$B); - &BODY_40_59(\$code,41,$B,$C,$D,$E,$T,$A); - &BODY_40_59(\$code,42,$A,$B,$C,$D,$E,$T); - &BODY_40_59(\$code,43,$T,$A,$B,$C,$D,$E); - &BODY_40_59(\$code,44,$E,$T,$A,$B,$C,$D); - &BODY_40_59(\$code,45,$D,$E,$T,$A,$B,$C); - &BODY_40_59(\$code,46,$C,$D,$E,$T,$A,$B); - &BODY_40_59(\$code,47,$B,$C,$D,$E,$T,$A); - &BODY_40_59(\$code,48,$A,$B,$C,$D,$E,$T); - &BODY_40_59(\$code,49,$T,$A,$B,$C,$D,$E); - &BODY_40_59(\$code,50,$E,$T,$A,$B,$C,$D); - 
&BODY_40_59(\$code,51,$D,$E,$T,$A,$B,$C); - &BODY_40_59(\$code,52,$C,$D,$E,$T,$A,$B); - &BODY_40_59(\$code,53,$B,$C,$D,$E,$T,$A); - &BODY_40_59(\$code,54,$A,$B,$C,$D,$E,$T); - &BODY_40_59(\$code,55,$T,$A,$B,$C,$D,$E); - &BODY_40_59(\$code,56,$E,$T,$A,$B,$C,$D); - &BODY_40_59(\$code,57,$D,$E,$T,$A,$B,$C); - &BODY_40_59(\$code,58,$C,$D,$E,$T,$A,$B); - &BODY_40_59(\$code,59,$B,$C,$D,$E,$T,$A); - - &BODY_60_79(\$code,60,$A,$B,$C,$D,$E,$T); - &BODY_60_79(\$code,61,$T,$A,$B,$C,$D,$E); - &BODY_60_79(\$code,62,$E,$T,$A,$B,$C,$D); - &BODY_60_79(\$code,63,$D,$E,$T,$A,$B,$C); - &BODY_60_79(\$code,64,$C,$D,$E,$T,$A,$B); - &BODY_60_79(\$code,65,$B,$C,$D,$E,$T,$A); - &BODY_60_79(\$code,66,$A,$B,$C,$D,$E,$T); - &BODY_60_79(\$code,67,$T,$A,$B,$C,$D,$E); - &BODY_60_79(\$code,68,$E,$T,$A,$B,$C,$D); - &BODY_60_79(\$code,69,$D,$E,$T,$A,$B,$C); - &BODY_60_79(\$code,70,$C,$D,$E,$T,$A,$B); - &BODY_60_79(\$code,71,$B,$C,$D,$E,$T,$A); - &BODY_60_79(\$code,72,$A,$B,$C,$D,$E,$T); - &BODY_60_79(\$code,73,$T,$A,$B,$C,$D,$E); - &BODY_60_79(\$code,74,$E,$T,$A,$B,$C,$D); - &BODY_60_79(\$code,75,$D,$E,$T,$A,$B,$C); - &BODY_60_79(\$code,76,$C,$D,$E,$T,$A,$B); - &BODY_60_79(\$code,77,$B,$C,$D,$E,$T,$A); - &BODY_60_79(\$code,78,$A,$B,$C,$D,$E,$T); - &BODY_60_79(\$code,79,$T,$A,$B,$C,$D,$E); - -$code.=<<___; -{ .mmb; add $h0=$h0,$E - nop.m 0 - br.ctop.dptk.many .Lhtop };; -.Lhend: -{ .mmi; add tmp0=4,ctx - mov ar.lc=r3 };; -{ .mmi; st4 [ctx]=$h0,8 - st4 [tmp0]=$h1,8 };; -{ .mmi; st4 [ctx]=$h2,8 - st4 [tmp0]=$h3 };; -{ .mib; st4 [ctx]=$h4,-16 - mov pr=r2,0x1ffff - br.ret.sptk.many b0 };; -.endp sha1_block_asm_host_order# -___ - - -$code.=<<___; -// void sha1_block_asm_data_order(SHA_CTX *c,const void *p,size_t num); -.global sha1_block_asm_data_order# -.proc sha1_block_asm_data_order# -.align 32 -sha1_block_asm_data_order: -___ -$code.=<<___ if ($big_endian); -{ .mmi; and r2=3,inp };; -{ .mib; cmp.eq p6,p0=r0,r2 -(p6) br.dptk.many sha1_block_asm_host_order };; -___ -$code.=<<___; - .prologue - .fframe 0 - .save ar.pfs,r0 .save ar.lc,r3 -{ .mmi; alloc tmp1=ar.pfs,3,15,0,0 - $ADDP tmp0=4,ctx mov r3=ar.lc } { .mmi; $ADDP ctx=0,ctx $ADDP inp=0,inp @@ -444,90 +273,16 @@ tmp6=loc14; ___ - &BODY_00_15(\$code, 0,$A,$B,$C,$D,$E,$T,1); - &BODY_00_15(\$code, 1,$T,$A,$B,$C,$D,$E,1); - &BODY_00_15(\$code, 2,$E,$T,$A,$B,$C,$D,1); - &BODY_00_15(\$code, 3,$D,$E,$T,$A,$B,$C,1); - &BODY_00_15(\$code, 4,$C,$D,$E,$T,$A,$B,1); - &BODY_00_15(\$code, 5,$B,$C,$D,$E,$T,$A,1); - &BODY_00_15(\$code, 6,$A,$B,$C,$D,$E,$T,1); - &BODY_00_15(\$code, 7,$T,$A,$B,$C,$D,$E,1); - &BODY_00_15(\$code, 8,$E,$T,$A,$B,$C,$D,1); - &BODY_00_15(\$code, 9,$D,$E,$T,$A,$B,$C,1); - &BODY_00_15(\$code,10,$C,$D,$E,$T,$A,$B,1); - &BODY_00_15(\$code,11,$B,$C,$D,$E,$T,$A,1); - &BODY_00_15(\$code,12,$A,$B,$C,$D,$E,$T,1); - &BODY_00_15(\$code,13,$T,$A,$B,$C,$D,$E,1); - &BODY_00_15(\$code,14,$E,$T,$A,$B,$C,$D,1); - &BODY_00_15(\$code,15,$D,$E,$T,$A,$B,$C,1); - - &BODY_16_19(\$code,16,$C,$D,$E,$T,$A,$B); - &BODY_16_19(\$code,17,$B,$C,$D,$E,$T,$A); - &BODY_16_19(\$code,18,$A,$B,$C,$D,$E,$T); - &BODY_16_19(\$code,19,$T,$A,$B,$C,$D,$E); +{ my $i,@V=($A,$B,$C,$D,$E,$T); - &BODY_20_39(\$code,20,$E,$T,$A,$B,$C,$D); - &BODY_20_39(\$code,21,$D,$E,$T,$A,$B,$C); - &BODY_20_39(\$code,22,$C,$D,$E,$T,$A,$B); - &BODY_20_39(\$code,23,$B,$C,$D,$E,$T,$A); - &BODY_20_39(\$code,24,$A,$B,$C,$D,$E,$T); - &BODY_20_39(\$code,25,$T,$A,$B,$C,$D,$E); - &BODY_20_39(\$code,26,$E,$T,$A,$B,$C,$D); - &BODY_20_39(\$code,27,$D,$E,$T,$A,$B,$C); - &BODY_20_39(\$code,28,$C,$D,$E,$T,$A,$B); - 
&BODY_20_39(\$code,29,$B,$C,$D,$E,$T,$A); - &BODY_20_39(\$code,30,$A,$B,$C,$D,$E,$T); - &BODY_20_39(\$code,31,$T,$A,$B,$C,$D,$E); - &BODY_20_39(\$code,32,$E,$T,$A,$B,$C,$D); - &BODY_20_39(\$code,33,$D,$E,$T,$A,$B,$C); - &BODY_20_39(\$code,34,$C,$D,$E,$T,$A,$B); - &BODY_20_39(\$code,35,$B,$C,$D,$E,$T,$A); - &BODY_20_39(\$code,36,$A,$B,$C,$D,$E,$T); - &BODY_20_39(\$code,37,$T,$A,$B,$C,$D,$E); - &BODY_20_39(\$code,38,$E,$T,$A,$B,$C,$D); - &BODY_20_39(\$code,39,$D,$E,$T,$A,$B,$C); + for($i=0;$i<16;$i++) { &BODY_00_15(\$code,$i,@V); unshift(@V,pop(@V)); } + for(;$i<20;$i++) { &BODY_16_19(\$code,$i,@V); unshift(@V,pop(@V)); } + for(;$i<40;$i++) { &BODY_20_39(\$code,$i,@V); unshift(@V,pop(@V)); } + for(;$i<60;$i++) { &BODY_40_59(\$code,$i,@V); unshift(@V,pop(@V)); } + for(;$i<80;$i++) { &BODY_60_79(\$code,$i,@V); unshift(@V,pop(@V)); } - &BODY_40_59(\$code,40,$C,$D,$E,$T,$A,$B); - &BODY_40_59(\$code,41,$B,$C,$D,$E,$T,$A); - &BODY_40_59(\$code,42,$A,$B,$C,$D,$E,$T); - &BODY_40_59(\$code,43,$T,$A,$B,$C,$D,$E); - &BODY_40_59(\$code,44,$E,$T,$A,$B,$C,$D); - &BODY_40_59(\$code,45,$D,$E,$T,$A,$B,$C); - &BODY_40_59(\$code,46,$C,$D,$E,$T,$A,$B); - &BODY_40_59(\$code,47,$B,$C,$D,$E,$T,$A); - &BODY_40_59(\$code,48,$A,$B,$C,$D,$E,$T); - &BODY_40_59(\$code,49,$T,$A,$B,$C,$D,$E); - &BODY_40_59(\$code,50,$E,$T,$A,$B,$C,$D); - &BODY_40_59(\$code,51,$D,$E,$T,$A,$B,$C); - &BODY_40_59(\$code,52,$C,$D,$E,$T,$A,$B); - &BODY_40_59(\$code,53,$B,$C,$D,$E,$T,$A); - &BODY_40_59(\$code,54,$A,$B,$C,$D,$E,$T); - &BODY_40_59(\$code,55,$T,$A,$B,$C,$D,$E); - &BODY_40_59(\$code,56,$E,$T,$A,$B,$C,$D); - &BODY_40_59(\$code,57,$D,$E,$T,$A,$B,$C); - &BODY_40_59(\$code,58,$C,$D,$E,$T,$A,$B); - &BODY_40_59(\$code,59,$B,$C,$D,$E,$T,$A); - - &BODY_60_79(\$code,60,$A,$B,$C,$D,$E,$T); - &BODY_60_79(\$code,61,$T,$A,$B,$C,$D,$E); - &BODY_60_79(\$code,62,$E,$T,$A,$B,$C,$D); - &BODY_60_79(\$code,63,$D,$E,$T,$A,$B,$C); - &BODY_60_79(\$code,64,$C,$D,$E,$T,$A,$B); - &BODY_60_79(\$code,65,$B,$C,$D,$E,$T,$A); - &BODY_60_79(\$code,66,$A,$B,$C,$D,$E,$T); - &BODY_60_79(\$code,67,$T,$A,$B,$C,$D,$E); - &BODY_60_79(\$code,68,$E,$T,$A,$B,$C,$D); - &BODY_60_79(\$code,69,$D,$E,$T,$A,$B,$C); - &BODY_60_79(\$code,70,$C,$D,$E,$T,$A,$B); - &BODY_60_79(\$code,71,$B,$C,$D,$E,$T,$A); - &BODY_60_79(\$code,72,$A,$B,$C,$D,$E,$T); - &BODY_60_79(\$code,73,$T,$A,$B,$C,$D,$E); - &BODY_60_79(\$code,74,$E,$T,$A,$B,$C,$D); - &BODY_60_79(\$code,75,$D,$E,$T,$A,$B,$C); - &BODY_60_79(\$code,76,$C,$D,$E,$T,$A,$B); - &BODY_60_79(\$code,77,$B,$C,$D,$E,$T,$A); - &BODY_60_79(\$code,78,$A,$B,$C,$D,$E,$T); - &BODY_60_79(\$code,79,$T,$A,$B,$C,$D,$E); + (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check +} $code.=<<___; { .mmb; add $h0=$h0,$E @@ -543,7 +298,8 @@ $code.=<<___; { .mib; st4 [ctx]=$h4,-16 mov pr=r2,0x1ffff br.ret.sptk.many b0 };; -.endp sha1_block_asm_data_order# +.endp sha1_block_data_order# +stringz "SHA1 block transform for IA64, CRYPTOGAMS by " ___ print $code; diff --git a/crypto/sha/asm/sha512-ia64.pl b/crypto/sha/asm/sha512-ia64.pl index 0aea02399a..1c6ce56522 100755 --- a/crypto/sha/asm/sha512-ia64.pl +++ b/crypto/sha/asm/sha512-ia64.pl @@ -2,8 +2,9 @@ # # ==================================================================== # Written by Andy Polyakov for the OpenSSL -# project. Rights for redistribution and usage in source and binary -# forms are granted according to the OpenSSL license. +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. 
For further +# details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # SHA256/512_Transform for Itanium. @@ -71,7 +72,7 @@ if ($output =~ /512.*\.[s|asm]/) { $ADD="add"; $SHRU="shr.u"; $TABLE="K512"; - $func="sha512_block"; + $func="sha512_block_data_order"; @Sigma0=(28,34,39); @Sigma1=(14,18,41); @sigma0=(1, 8, 7); @@ -85,7 +86,7 @@ if ($output =~ /512.*\.[s|asm]/) { $ADD="padd4"; $SHRU="pshr4.u"; $TABLE="K256"; - $func="sha256_block"; + $func="sha256_block_data_order"; @Sigma0=( 2,13,22); @Sigma1=( 6,11,25); @sigma0=( 7,18, 3); @@ -105,11 +106,13 @@ if (!defined($big_endian)) { $big_endian=(unpack('L',pack('N',1))==1); } $code=<<___; -.ident \"$output, version 1.0\" +.ident \"$output, version 1.1\" .ident \"IA-64 ISA artwork by Andy Polyakov \" .explicit .text +pfssave=r2; +lcsave=r3; prsave=r14; K=r15; A=r16; B=r17; C=r18; D=r19; @@ -121,6 +124,8 @@ ctx=r31; // 1st arg input=r48; // 2nd arg num=r49; // 3rd arg sgm0=r50; sgm1=r51; // small constants +A_=r54; B_=r55; C_=r56; D_=r57; +E_=r58; F_=r59; G_=r60; H_=r61; // void $func (SHA_CTX *ctx, const void *in,size_t num[,int host]) .global $func# @@ -128,82 +133,319 @@ sgm0=r50; sgm1=r51; // small constants .align 32 $func: .prologue - .fframe 0 - .save ar.pfs,r2 - .save ar.lc,r3 - .save pr,prsave -{ .mmi; alloc r2=ar.pfs,3,17,0,16 + .save ar.pfs,pfssave +{ .mmi; alloc pfssave=ar.pfs,3,27,0,16 $ADDP ctx=0,r32 // 1st arg - mov r3=ar.lc } + .save ar.lc,lcsave + mov lcsave=ar.lc } { .mmi; $ADDP input=0,r33 // 2nd arg - addl Ktbl=\@ltoff($TABLE#),gp + mov num=r34 // 3rd arg + .save pr,prsave mov prsave=pr };; .body -{ .mii; ld8 Ktbl=[Ktbl] - mov num=r34 };; // 3rd arg - { .mib; add r8=0*$SZ,ctx add r9=1*$SZ,ctx - brp.loop.imp .L_first16,.L_first16_ctop - } + brp.loop.imp .L_first16,.L_first16_end-16 } { .mib; add r10=2*$SZ,ctx add r11=3*$SZ,ctx - brp.loop.imp .L_rest,.L_rest_ctop - };; -// load A-H -{ .mmi; $LDW A=[r8],4*$SZ - $LDW B=[r9],4*$SZ - mov sgm0=$sigma0[2] } -{ .mmi; $LDW C=[r10],4*$SZ - $LDW D=[r11],4*$SZ - mov sgm1=$sigma1[2] };; -{ .mmi; $LDW E=[r8] - $LDW F=[r9] } -{ .mmi; $LDW G=[r10] - $LDW H=[r11] - cmp.ne p15,p14=0,r35 };; // used in sha256_block + brp.loop.imp .L_rest,.L_rest_end-16 };; +// load A-H +.Lpic_point: +{ .mmi; $LDW A_=[r8],4*$SZ + $LDW B_=[r9],4*$SZ + mov Ktbl=ip } +{ .mmi; $LDW C_=[r10],4*$SZ + $LDW D_=[r11],4*$SZ + mov sgm0=$sigma0[2] };; +{ .mmi; $LDW E_=[r8] + $LDW F_=[r9] + add Ktbl=($TABLE#-.Lpic_point),Ktbl } +{ .mmi; $LDW G_=[r10] + $LDW H_=[r11] + cmp.ne p0,p16=0,r0 };; // used in sha256_block +___ +$code.=<<___ if ($BITS==64); +{ .mii; and r8=7,input + and input=~7,input;; + cmp.eq p9,p0=1,r8 } +{ .mmi; cmp.eq p10,p0=2,r8 + cmp.eq p11,p0=3,r8 + cmp.eq p12,p0=4,r8 } +{ .mmi; cmp.eq p13,p0=5,r8 + cmp.eq p14,p0=6,r8 + cmp.eq p15,p0=7,r8 };; +___ +$code.=<<___; .L_outer: -{ .mii; mov ar.lc=15 - mov ar.ec=1 };; -.align 32 -.L_first16: .rotr X[16] +{ .mmi; mov A=A_ + mov B=B_ + mov ar.lc=14 } +{ .mmi; mov C=C_ + mov D=D_ + mov E=E_ } +{ .mmi; mov F=F_ + mov G=G_ + mov ar.ec=2 } +{ .mmi; ld1 X[15]=[input],$SZ // eliminated in 64-bit + mov H=H_ + mov sgm1=$sigma1[2] };; + ___ $t0="t0", $t1="t1", $code.=<<___ if ($BITS==32); -{ .mib; (p14) add r9=1,input - (p14) add r10=2,input } -{ .mib; (p14) add r11=3,input - (p15) br.dptk.few .L_host };; -{ .mmi; (p14) ld1 r8=[input],$SZ - (p14) ld1 r9=[r9] } -{ .mmi; (p14) ld1 r10=[r10] - (p14) ld1 r11=[r11] };; -{ .mii; (p14) dep r9=r8,r9,8,8 - (p14) dep r11=r10,r11,8,8 };; -{ .mib; (p14) dep 
X[15]=r9,r11,16,16 };; -.L_host: -{ .mib; (p15) $LDW X[15]=[input],$SZ // X[i]=*input++ +.align 32 +.L_first16: +{ .mmi; add r9=1-$SZ,input + add r10=2-$SZ,input + add r11=3-$SZ,input };; +{ .mmi; ld1 r9=[r9] + ld1 r10=[r10] dep.z $t1=E,32,32 } -{ .mib; $LDW K=[Ktbl],$SZ +{ .mmi; $LDW K=[Ktbl],$SZ + ld1 r11=[r11] zxt4 E=E };; -{ .mmi; or $t1=$t1,E - and T1=F,E - and T2=A,B } +{ .mii; or $t1=$t1,E + dep X[15]=X[15],r9,8,8 + dep r11=r10,r11,8,8 };; +{ .mmi; and T1=F,E + and T2=A,B + dep X[15]=X[15],r11,16,16 } { .mmi; andcm r8=G,E and r9=A,C mux2 $t0=A,0x44 };; // copy lower half to upper -{ .mib; xor T1=T1,r8 // T1=((e & f) ^ (~e & g)) +{ .mmi; (p16) ld1 X[15-1]=[input],$SZ // prefetch + xor T1=T1,r8 // T1=((e & f) ^ (~e & g)) _rotr r11=$t1,$Sigma1[0] } // ROTR(e,14) { .mib; and r10=B,C xor T2=T2,r9 };; ___ $t0="A", $t1="E", $code.=<<___ if ($BITS==64); -{ .mmi; $LDW X[15]=[input],$SZ // X[i]=*input++ +// in 64-bit mode I load whole X[16] at once and take care of alignment... +{ .mmi; add r8=1*$SZ,input + add r9=2*$SZ,input + add r10=3*$SZ,input };; +{ .mmb; $LDW X[15]=[input],4*$SZ + $LDW X[14]=[r8],4*$SZ +(p9) br.cond.dpnt.many .L1byte };; +{ .mmb; $LDW X[13]=[r9],4*$SZ + $LDW X[12]=[r10],4*$SZ +(p10) br.cond.dpnt.many .L2byte };; +{ .mmb; $LDW X[11]=[input],4*$SZ + $LDW X[10]=[r8],4*$SZ +(p11) br.cond.dpnt.many .L3byte };; +{ .mmb; $LDW X[ 9]=[r9],4*$SZ + $LDW X[ 8]=[r10],4*$SZ +(p12) br.cond.dpnt.many .L4byte };; +{ .mmb; $LDW X[ 7]=[input],4*$SZ + $LDW X[ 6]=[r8],4*$SZ +(p13) br.cond.dpnt.many .L5byte };; +{ .mmb; $LDW X[ 5]=[r9],4*$SZ + $LDW X[ 4]=[r10],4*$SZ +(p14) br.cond.dpnt.many .L6byte };; +{ .mmb; $LDW X[ 3]=[input],4*$SZ + $LDW X[ 2]=[r8],4*$SZ +(p15) br.cond.dpnt.many .L7byte };; +{ .mmb; $LDW X[ 1]=[r9],4*$SZ + $LDW X[ 0]=[r10],4*$SZ + br.many .L_first16 };; +.L1byte: +{ .mmi; $LDW X[13]=[r9],4*$SZ + $LDW X[12]=[r10],4*$SZ + shrp X[15]=X[15],X[14],56 };; +{ .mmi; $LDW X[11]=[input],4*$SZ + $LDW X[10]=[r8],4*$SZ + shrp X[14]=X[14],X[13],56 } +{ .mmi; $LDW X[ 9]=[r9],4*$SZ + $LDW X[ 8]=[r10],4*$SZ + shrp X[13]=X[13],X[12],56 };; +{ .mmi; $LDW X[ 7]=[input],4*$SZ + $LDW X[ 6]=[r8],4*$SZ + shrp X[12]=X[12],X[11],56 } +{ .mmi; $LDW X[ 5]=[r9],4*$SZ + $LDW X[ 4]=[r10],4*$SZ + shrp X[11]=X[11],X[10],56 };; +{ .mmi; $LDW X[ 3]=[input],4*$SZ + $LDW X[ 2]=[r8],4*$SZ + shrp X[10]=X[10],X[ 9],56 } +{ .mmi; $LDW X[ 1]=[r9],4*$SZ + $LDW X[ 0]=[r10],4*$SZ + shrp X[ 9]=X[ 9],X[ 8],56 };; +{ .mii; $LDW T1=[input] + shrp X[ 8]=X[ 8],X[ 7],56 + shrp X[ 7]=X[ 7],X[ 6],56 } +{ .mii; shrp X[ 6]=X[ 6],X[ 5],56 + shrp X[ 5]=X[ 5],X[ 4],56 };; +{ .mii; shrp X[ 4]=X[ 4],X[ 3],56 + shrp X[ 3]=X[ 3],X[ 2],56 } +{ .mii; shrp X[ 2]=X[ 2],X[ 1],56 + shrp X[ 1]=X[ 1],X[ 0],56 } +{ .mib; shrp X[ 0]=X[ 0],T1,56 + br.many .L_first16 };; +.L2byte: +{ .mmi; $LDW X[11]=[input],4*$SZ + $LDW X[10]=[r8],4*$SZ + shrp X[15]=X[15],X[14],48 } +{ .mmi; $LDW X[ 9]=[r9],4*$SZ + $LDW X[ 8]=[r10],4*$SZ + shrp X[14]=X[14],X[13],48 };; +{ .mmi; $LDW X[ 7]=[input],4*$SZ + $LDW X[ 6]=[r8],4*$SZ + shrp X[13]=X[13],X[12],48 } +{ .mmi; $LDW X[ 5]=[r9],4*$SZ + $LDW X[ 4]=[r10],4*$SZ + shrp X[12]=X[12],X[11],48 };; +{ .mmi; $LDW X[ 3]=[input],4*$SZ + $LDW X[ 2]=[r8],4*$SZ + shrp X[11]=X[11],X[10],48 } +{ .mmi; $LDW X[ 1]=[r9],4*$SZ + $LDW X[ 0]=[r10],4*$SZ + shrp X[10]=X[10],X[ 9],48 };; +{ .mii; $LDW T1=[input] + shrp X[ 9]=X[ 9],X[ 8],48 + shrp X[ 8]=X[ 8],X[ 7],48 } +{ .mii; shrp X[ 7]=X[ 7],X[ 6],48 + shrp X[ 6]=X[ 6],X[ 5],48 };; +{ .mii; shrp X[ 5]=X[ 5],X[ 4],48 + shrp X[ 4]=X[ 4],X[ 3],48 } +{ .mii; shrp X[ 3]=X[ 3],X[ 2],48 
+ shrp X[ 2]=X[ 2],X[ 1],48 } +{ .mii; shrp X[ 1]=X[ 1],X[ 0],48 + shrp X[ 0]=X[ 0],T1,48 } +{ .mfb; br.many .L_first16 };; +.L3byte: +{ .mmi; $LDW X[ 9]=[r9],4*$SZ + $LDW X[ 8]=[r10],4*$SZ + shrp X[15]=X[15],X[14],40 };; +{ .mmi; $LDW X[ 7]=[input],4*$SZ + $LDW X[ 6]=[r8],4*$SZ + shrp X[14]=X[14],X[13],40 } +{ .mmi; $LDW X[ 5]=[r9],4*$SZ + $LDW X[ 4]=[r10],4*$SZ + shrp X[13]=X[13],X[12],40 };; +{ .mmi; $LDW X[ 3]=[input],4*$SZ + $LDW X[ 2]=[r8],4*$SZ + shrp X[12]=X[12],X[11],40 } +{ .mmi; $LDW X[ 1]=[r9],4*$SZ + $LDW X[ 0]=[r10],4*$SZ + shrp X[11]=X[11],X[10],40 };; +{ .mii; $LDW T1=[input] + shrp X[10]=X[10],X[ 9],40 + shrp X[ 9]=X[ 9],X[ 8],40 } +{ .mii; shrp X[ 8]=X[ 8],X[ 7],40 + shrp X[ 7]=X[ 7],X[ 6],40 };; +{ .mii; shrp X[ 6]=X[ 6],X[ 5],40 + shrp X[ 5]=X[ 5],X[ 4],40 } +{ .mii; shrp X[ 4]=X[ 4],X[ 3],40 + shrp X[ 3]=X[ 3],X[ 2],40 } +{ .mii; shrp X[ 2]=X[ 2],X[ 1],40 + shrp X[ 1]=X[ 1],X[ 0],40 } +{ .mib; shrp X[ 0]=X[ 0],T1,40 + br.many .L_first16 };; +.L4byte: +{ .mmi; $LDW X[ 7]=[input],4*$SZ + $LDW X[ 6]=[r8],4*$SZ + shrp X[15]=X[15],X[14],32 } +{ .mmi; $LDW X[ 5]=[r9],4*$SZ + $LDW X[ 4]=[r10],4*$SZ + shrp X[14]=X[14],X[13],32 };; +{ .mmi; $LDW X[ 3]=[input],4*$SZ + $LDW X[ 2]=[r8],4*$SZ + shrp X[13]=X[13],X[12],32 } +{ .mmi; $LDW X[ 1]=[r9],4*$SZ + $LDW X[ 0]=[r10],4*$SZ + shrp X[12]=X[12],X[11],32 };; +{ .mii; $LDW T1=[input] + shrp X[11]=X[11],X[10],32 + shrp X[10]=X[10],X[ 9],32 } +{ .mii; shrp X[ 9]=X[ 9],X[ 8],32 + shrp X[ 8]=X[ 8],X[ 7],32 };; +{ .mii; shrp X[ 7]=X[ 7],X[ 6],32 + shrp X[ 6]=X[ 6],X[ 5],32 } +{ .mii; shrp X[ 5]=X[ 5],X[ 4],32 + shrp X[ 4]=X[ 4],X[ 3],32 } +{ .mii; shrp X[ 3]=X[ 3],X[ 2],32 + shrp X[ 2]=X[ 2],X[ 1],32 } +{ .mii; shrp X[ 1]=X[ 1],X[ 0],32 + shrp X[ 0]=X[ 0],T1,32 } +{ .mfb; br.many .L_first16 };; +.L5byte: +{ .mmi; $LDW X[ 5]=[r9],4*$SZ + $LDW X[ 4]=[r10],4*$SZ + shrp X[15]=X[15],X[14],24 };; +{ .mmi; $LDW X[ 3]=[input],4*$SZ + $LDW X[ 2]=[r8],4*$SZ + shrp X[14]=X[14],X[13],24 } +{ .mmi; $LDW X[ 1]=[r9],4*$SZ + $LDW X[ 0]=[r10],4*$SZ + shrp X[13]=X[13],X[12],24 };; +{ .mii; $LDW T1=[input] + shrp X[12]=X[12],X[11],24 + shrp X[11]=X[11],X[10],24 } +{ .mii; shrp X[10]=X[10],X[ 9],24 + shrp X[ 9]=X[ 9],X[ 8],24 };; +{ .mii; shrp X[ 8]=X[ 8],X[ 7],24 + shrp X[ 7]=X[ 7],X[ 6],24 } +{ .mii; shrp X[ 6]=X[ 6],X[ 5],24 + shrp X[ 5]=X[ 5],X[ 4],24 } +{ .mii; shrp X[ 4]=X[ 4],X[ 3],24 + shrp X[ 3]=X[ 3],X[ 2],24 } +{ .mii; shrp X[ 2]=X[ 2],X[ 1],24 + shrp X[ 1]=X[ 1],X[ 0],24 } +{ .mib; shrp X[ 0]=X[ 0],T1,24 + br.many .L_first16 };; +.L6byte: +{ .mmi; $LDW X[ 3]=[input],4*$SZ + $LDW X[ 2]=[r8],4*$SZ + shrp X[15]=X[15],X[14],16 } +{ .mmi; $LDW X[ 1]=[r9],4*$SZ + $LDW X[ 0]=[r10],4*$SZ + shrp X[14]=X[14],X[13],16 };; +{ .mii; $LDW T1=[input] + shrp X[13]=X[13],X[12],16 + shrp X[12]=X[12],X[11],16 } +{ .mii; shrp X[11]=X[11],X[10],16 + shrp X[10]=X[10],X[ 9],16 };; +{ .mii; shrp X[ 9]=X[ 9],X[ 8],16 + shrp X[ 8]=X[ 8],X[ 7],16 } +{ .mii; shrp X[ 7]=X[ 7],X[ 6],16 + shrp X[ 6]=X[ 6],X[ 5],16 } +{ .mii; shrp X[ 5]=X[ 5],X[ 4],16 + shrp X[ 4]=X[ 4],X[ 3],16 } +{ .mii; shrp X[ 3]=X[ 3],X[ 2],16 + shrp X[ 2]=X[ 2],X[ 1],16 } +{ .mii; shrp X[ 1]=X[ 1],X[ 0],16 + shrp X[ 0]=X[ 0],T1,16 } +{ .mfb; br.many .L_first16 };; +.L7byte: +{ .mmi; $LDW X[ 1]=[r9],4*$SZ + $LDW X[ 0]=[r10],4*$SZ + shrp X[15]=X[15],X[14],8 };; +{ .mii; $LDW T1=[input] + shrp X[14]=X[14],X[13],8 + shrp X[13]=X[13],X[12],8 } +{ .mii; shrp X[12]=X[12],X[11],8 + shrp X[11]=X[11],X[10],8 };; +{ .mii; shrp X[10]=X[10],X[ 9],8 + shrp X[ 9]=X[ 9],X[ 8],8 } +{ .mii; shrp X[ 8]=X[ 8],X[ 7],8 + shrp 
X[ 7]=X[ 7],X[ 6],8 } +{ .mii; shrp X[ 6]=X[ 6],X[ 5],8 + shrp X[ 5]=X[ 5],X[ 4],8 } +{ .mii; shrp X[ 4]=X[ 4],X[ 3],8 + shrp X[ 3]=X[ 3],X[ 2],8 } +{ .mii; shrp X[ 2]=X[ 2],X[ 1],8 + shrp X[ 1]=X[ 1],X[ 0],8 } +{ .mib; shrp X[ 0]=X[ 0],T1,8 + br.many .L_first16 };; + +.align 32 +.L_first16: +{ .mmi; $LDW K=[Ktbl],$SZ and T1=F,E and T2=A,B } -{ .mmi; $LDW K=[Ktbl],$SZ +{ .mmi; //$LDW X[15]=[input],$SZ // X[i]=*input++ andcm r8=G,E and r9=A,C };; { .mmi; xor T1=T1,r8 //T1=((e & f) ^ (~e & g)) @@ -236,13 +478,14 @@ $code.=<<___; { .mmi; xor r10=r8,r10 // r10=Sigma0(a) mov B=A add A=T1,T2 };; -.L_first16_ctop: { .mib; add E=E,T1 add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a) br.ctop.sptk .L_first16 };; +.L_first16_end: + +{ .mii; mov ar.lc=$rounds-17 + mov ar.ec=1 };; -{ .mib; mov ar.lc=$rounds-17 } -{ .mib; mov ar.ec=1 };; .align 32 .L_rest: .rotr X[16] @@ -311,46 +554,38 @@ $code.=<<___; { .mmi; xor r10=r8,r10 // r10=Sigma0(a) mov B=A add A=T1,T2 };; -.L_rest_ctop: { .mib; add E=E,T1 add A=A,r10 // T2=Maj(a,b,c)+Sigma0(a) br.ctop.sptk .L_rest };; +.L_rest_end: + +{ .mmi; add A_=A_,A + add B_=B_,B + add C_=C_,C } +{ .mmi; add D_=D_,D + add E_=E_,E + cmp.ltu p16,p0=1,num };; +{ .mmi; add F_=F_,F + add G_=G_,G + add H_=H_,H } +{ .mmb; add Ktbl=-$SZ*$rounds,Ktbl +(p16) add num=-1,num +(p16) br.dptk.many .L_outer };; { .mib; add r8=0*$SZ,ctx add r9=1*$SZ,ctx } { .mib; add r10=2*$SZ,ctx add r11=3*$SZ,ctx };; -{ .mmi; $LDW r32=[r8],4*$SZ - $LDW r33=[r9],4*$SZ } -{ .mmi; $LDW r34=[r10],4*$SZ - $LDW r35=[r11],4*$SZ - cmp.ltu p6,p7=1,num };; -{ .mmi; $LDW r36=[r8],-4*$SZ - $LDW r37=[r9],-4*$SZ -(p6) add Ktbl=-$SZ*$rounds,Ktbl } -{ .mmi; $LDW r38=[r10],-4*$SZ - $LDW r39=[r11],-4*$SZ -(p7) mov ar.lc=r3 };; -{ .mmi; add A=A,r32 - add B=B,r33 - add C=C,r34 } -{ .mmi; add D=D,r35 - add E=E,r36 - add F=F,r37 };; -{ .mmi; $STW [r8]=A,4*$SZ - $STW [r9]=B,4*$SZ - add G=G,r38 } -{ .mmi; $STW [r10]=C,4*$SZ - $STW [r11]=D,4*$SZ - add H=H,r39 };; -{ .mmi; $STW [r8]=E - $STW [r9]=F -(p6) add num=-1,num } -{ .mmb; $STW [r10]=G - $STW [r11]=H -(p6) br.dptk.many .L_outer };; - -{ .mib; mov pr=prsave,0x1ffff +{ .mmi; $STW [r8]=A_,4*$SZ + $STW [r9]=B_,4*$SZ + mov ar.lc=lcsave } +{ .mmi; $STW [r10]=C_,4*$SZ + $STW [r11]=D_,4*$SZ + mov pr=prsave,0x1ffff };; +{ .mmb; $STW [r8]=E_ + $STW [r9]=F_ } +{ .mmb; $STW [r10]=G_ + $STW [r11]=H_ br.ret.sptk.many b0 };; .endp $func# ___ @@ -359,7 +594,10 @@ $code =~ s/\`([^\`]*)\`/eval $1/gem; $code =~ s/_rotr(\s+)([^=]+)=([^,]+),([0-9]+)/shrp$1$2=$3,$3,$4/gm; if ($BITS==64) { $code =~ s/mux2(\s+)\S+/nop.i$1 0x0/gm; - $code =~ s/mux1(\s+)\S+/nop.i$1 0x0/gm if ($big_endian); + $code =~ s/mux1(\s+)\S+/nop.i$1 0x0/gm if ($big_endian); + $code =~ s/(shrp\s+X\[[^=]+)=([^,]+),([^,]+),([1-9]+)/$1=$3,$2,64-$4/gm + if (!$big_endian); + $code =~ s/ld1(\s+)X\[\S+/nop.m$1 0x0/gm; } print $code; @@ -384,6 +622,7 @@ K256: data4 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 data4 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 data4 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .size K256#,$SZ*$rounds +stringz "SHA256 block transform for IA64, CRYPTOGAMS by " ___ print<<___ if ($BITS==64); .align 64 @@ -429,4 +668,5 @@ K512: data8 0x428a2f98d728ae22,0x7137449123ef65cd data8 0x4cc5d4becb3e42b6,0x597f299cfc657e2a data8 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .size K512#,$SZ*$rounds +stringz "SHA512 block transform for IA64, CRYPTOGAMS by " ___ diff --git a/crypto/sha/sha256.c b/crypto/sha/sha256.c index c1cdf089e1..e5b8d8d2b1 100644 --- a/crypto/sha/sha256.c +++ b/crypto/sha/sha256.c @@ -76,17 +76,11 @@ int 
SHA224_Update(SHA256_CTX *c, const void *data, size_t len) int SHA224_Final (unsigned char *md, SHA256_CTX *c) { return SHA256_Final (md,c); } -#ifndef SHA_LONG_LOG2 -#define SHA_LONG_LOG2 2 /* default to 32 bits */ -#endif - #define DATA_ORDER_IS_BIG_ENDIAN #define HASH_LONG SHA_LONG -#define HASH_LONG_LOG2 SHA_LONG_LOG2 #define HASH_CTX SHA256_CTX #define HASH_CBLOCK SHA_CBLOCK -#define HASH_LBLOCK SHA_LBLOCK /* * Note that FIPS180-2 discusses "Truncation of the Hash Function Output." * default: case below covers for it. It's not clear however if it's @@ -119,16 +113,15 @@ int SHA224_Final (unsigned char *md, SHA256_CTX *c) #define HASH_UPDATE SHA256_Update #define HASH_TRANSFORM SHA256_Transform #define HASH_FINAL SHA256_Final -#define HASH_BLOCK_HOST_ORDER sha256_block_host_order #define HASH_BLOCK_DATA_ORDER sha256_block_data_order -void sha256_block_host_order (SHA256_CTX *ctx, const void *in, size_t num); +#ifndef SHA256_ASM +static +#endif void sha256_block_data_order (SHA256_CTX *ctx, const void *in, size_t num); #include "md32_common.h" -#ifdef SHA256_ASM -void sha256_block (SHA256_CTX *ctx, const void *in, size_t num, int host); -#else +#ifndef SHA256_ASM static const SHA_LONG K256[64] = { 0x428a2f98UL,0x71374491UL,0xb5c0fbcfUL,0xe9b5dba5UL, 0x3956c25bUL,0x59f111f1UL,0x923f82a4UL,0xab1c5ed5UL, @@ -162,10 +155,10 @@ static const SHA_LONG K256[64] = { #ifdef OPENSSL_SMALL_FOOTPRINT -static void sha256_block (SHA256_CTX *ctx, const void *in, size_t num, int host) +static void sha256_block_data_order (SHA256_CTX *ctx, const void *in, size_t num) { unsigned MD32_REG_T a,b,c,d,e,f,g,h,s0,s1,T1,T2; - SHA_LONG X[16]; + SHA_LONG X[16],l; int i; const unsigned char *data=in; @@ -174,33 +167,13 @@ static void sha256_block (SHA256_CTX *ctx, const void *in, size_t num, int host) a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3]; e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7]; - if (host) - { - const SHA_LONG *W=(const SHA_LONG *)data; - - for (i=0;i<16;i++) - { - T1 = X[i] = W[i]; - T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i]; - T2 = Sigma0(a) + Maj(a,b,c); - h = g; g = f; f = e; e = d + T1; - d = c; c = b; b = a; a = T1 + T2; - } - - data += SHA256_CBLOCK; - } - else + for (i=0;i<16;i++) { - SHA_LONG l; - - for (i=0;i<16;i++) - { - HOST_c2l(data,l); T1 = X[i] = l; - T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i]; - T2 = Sigma0(a) + Maj(a,b,c); - h = g; g = f; f = e; e = d + T1; - d = c; c = b; b = a; a = T1 + T2; - } + HOST_c2l(data,l); T1 = X[i] = l; + T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i]; + T2 = Sigma0(a) + Maj(a,b,c); + h = g; g = f; f = e; e = d + T1; + d = c; c = b; b = a; a = T1 + T2; } for (;i<64;i++) @@ -234,19 +207,20 @@ static void sha256_block (SHA256_CTX *ctx, const void *in, size_t num, int host) T1 = X[(i)&0x0f] += s0 + s1 + X[(i+9)&0x0f]; \ ROUND_00_15(i,a,b,c,d,e,f,g,h); } while (0) -static void sha256_block (SHA256_CTX *ctx, const void *in, size_t num, int host) +static void sha256_block_data_order (SHA256_CTX *ctx, const void *in, size_t num) { unsigned MD32_REG_T a,b,c,d,e,f,g,h,s0,s1,T1; SHA_LONG X[16]; int i; const unsigned char *data=in; + const union { long one; char little; } is_endian = {1}; while (num--) { a = ctx->h[0]; b = ctx->h[1]; c = ctx->h[2]; d = ctx->h[3]; e = ctx->h[4]; f = ctx->h[5]; g = ctx->h[6]; h = ctx->h[7]; - if (host) + if (!is_endian.little && sizeof(SHA_LONG)==4 && ((size_t)in%4)==0) { const SHA_LONG *W=(const SHA_LONG *)data; @@ -312,15 +286,4 @@ static void sha256_block (SHA256_CTX *ctx, const void *in, size_t num, int host) 
#endif #endif /* SHA256_ASM */ -/* - * Idea is to trade couple of cycles for some space. On IA-32 we save - * about 4K in "big footprint" case. In "small footprint" case any gain - * is appreciated:-) - */ -void HASH_BLOCK_HOST_ORDER (SHA256_CTX *ctx, const void *in, size_t num) -{ sha256_block (ctx,in,num,1); } - -void HASH_BLOCK_DATA_ORDER (SHA256_CTX *ctx, const void *in, size_t num) -{ sha256_block (ctx,in,num,0); } - #endif /* OPENSSL_NO_SHA256 */ diff --git a/crypto/sha/sha512.c b/crypto/sha/sha512.c index 45edbd2d2d..dabad88fda 100644 --- a/crypto/sha/sha512.c +++ b/crypto/sha/sha512.c @@ -53,7 +53,10 @@ const char SHA512_version[]="SHA-512" OPENSSL_VERSION_PTEXT; -#if defined(_M_IX86) || defined(_M_AMD64) || defined(__i386) || defined(__x86_64) +#if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \ + defined(__x86_64) || defined(_M_AMD64) || defined(_M_X64) || \ + defined(__s390__) || defined(__s390x__) || \ + defined(SHA512_ASM) #define SHA512_BLOCK_CAN_MANAGE_UNALIGNED_DATA #endif @@ -96,7 +99,7 @@ int SHA512_Init (SHA512_CTX *c) #ifndef SHA512_ASM static #endif -void sha512_block (SHA512_CTX *ctx, const void *in, size_t num); +void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num); int SHA512_Final (unsigned char *md, SHA512_CTX *c) { @@ -107,7 +110,7 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c) n++; if (n > (sizeof(c->u)-16)) memset (p+n,0,sizeof(c->u)-n), n=0, - sha512_block (c,p,1); + sha512_block_data_order (c,p,1); memset (p+n,0,sizeof(c->u)-16-n); #ifdef B_ENDIAN @@ -132,7 +135,7 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c) p[sizeof(c->u)-16] = (unsigned char)(c->Nh>>56); #endif - sha512_block (c,p,1); + sha512_block_data_order (c,p,1); if (md==0) return 0; @@ -204,7 +207,7 @@ int SHA512_Update (SHA512_CTX *c, const void *_data, size_t len) else { memcpy (p+c->num,data,n), c->num = 0; len-=n, data+=n; - sha512_block (c,p,1); + sha512_block_data_order (c,p,1); } } @@ -214,12 +217,12 @@ int SHA512_Update (SHA512_CTX *c, const void *_data, size_t len) if ((size_t)data%sizeof(c->u.d[0]) != 0) while (len >= sizeof(c->u)) memcpy (p,data,sizeof(c->u)), - sha512_block (c,p,1), + sha512_block_data_order (c,p,1), len -= sizeof(c->u), data += sizeof(c->u); else #endif - sha512_block (c,data,len/sizeof(c->u)), + sha512_block_data_order (c,data,len/sizeof(c->u)), data += len, len %= sizeof(c->u), data -= len; @@ -234,7 +237,7 @@ int SHA384_Update (SHA512_CTX *c, const void *data, size_t len) { return SHA512_Update (c,data,len); } void SHA512_Transform (SHA512_CTX *c, const unsigned char *data) -{ sha512_block (c,data,1); } +{ sha512_block_data_order (c,data,1); } unsigned char *SHA384(const unsigned char *d, size_t n, unsigned char *md) { @@ -308,40 +311,75 @@ static const SHA_LONG64 K512[80] = { #ifndef PEDANTIC # if defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) # if defined(__x86_64) || defined(__x86_64__) -# define PULL64(x) ({ SHA_LONG64 ret=*((const SHA_LONG64 *)(&(x))); \ - asm ("bswapq %0" \ - : "=r"(ret) \ - : "0"(ret)); ret; }) -# endif -# endif -#endif - -#ifndef PULL64 -#define B(x,j) (((SHA_LONG64)(*(((const unsigned char *)(&x))+j)))<<((7-j)*8)) -#define PULL64(x) (B(x,0)|B(x,1)|B(x,2)|B(x,3)|B(x,4)|B(x,5)|B(x,6)|B(x,7)) -#endif - -#ifndef PEDANTIC -# if defined(_MSC_VER) -# if defined(_WIN64) /* applies to both IA-64 and AMD64 */ -# define ROTR(a,n) _rotr64((a),n) -# endif -# elif defined(__GNUC__) && __GNUC__>=2 && !defined(OPENSSL_NO_ASM) && 
!defined(OPENSSL_NO_INLINE_ASM) -# if defined(__x86_64) || defined(__x86_64__) # define ROTR(a,n) ({ unsigned long ret; \ asm ("rorq %1,%0" \ : "=r"(ret) \ : "J"(n),"0"(a) \ : "cc"); ret; }) -# elif defined(_ARCH_PPC) && defined(__64BIT__) +# if !defined(B_ENDIAN) +# define PULL64(x) ({ SHA_LONG64 ret=*((const SHA_LONG64 *)(&(x))); \ + asm ("bswapq %0" \ + : "=r"(ret) \ + : "0"(ret)); ret; }) +# endif +# elif (defined(__i386) || defined(__i386__)) && !defined(B_ENDIAN) +# if defined(I386_ONLY) +# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\ + unsigned int hi,lo; \ + asm("xchgb %%ah,%%al;xchgb %%dh,%%dl;"\ + "roll $16,%%eax; roll $16,%%edx; "\ + "xchgb %%ah,%%al;xchgb %%dh,%%dl;" \ + : "=a"(lo),"=d"(hi) \ + : "0"(p[1]),"1"(p[0]) : "cc"); \ + ((SHA_LONG64)hi)<<32|lo; }) +# else +# define PULL64(x) ({ const unsigned int *p=(const unsigned int *)(&(x));\ + unsigned int hi,lo; \ + asm ("bswapl %0; bswapl %1;" \ + : "=r"(lo),"=r"(hi) \ + : "0"(p[1]),"1"(p[0])); \ + ((SHA_LONG64)hi)<<32|lo; }) +# endif +# elif (defined(_ARCH_PPC) && defined(__64BIT__)) || defined(_ARCH_PPC64) # define ROTR(a,n) ({ unsigned long ret; \ asm ("rotrdi %0,%1,%2" \ : "=r"(ret) \ : "r"(a),"K"(n)); ret; }) # endif +# elif defined(_MSC_VER) +# if defined(_WIN64) /* applies to both IA-64 and AMD64 */ +# define ROTR(a,n) _rotr64((a),n) +# endif +# if defined(_M_IX86) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) +# if defined(I386_ONLY) + static SHA_LONG64 __fastcall __pull64be(const void *x) + { _asm mov edx, [ecx + 0] + _asm mov eax, [ecx + 4] + _asm xchg dh,dl + _asm xchg ah,al + _asm rol edx,16 + _asm rol eax,16 + _asm xchg dh,dl + _asm xchg ah,al + } +# else + static SHA_LONG64 __fastcall __pull64be(const void *x) + { _asm mov edx, [ecx + 0] + _asm mov eax, [ecx + 4] + _asm bswap edx + _asm bswap eax + } +# endif +# define PULL64(x) __pull64be(&(x)) +# endif # endif #endif +#ifndef PULL64 +#define B(x,j) (((SHA_LONG64)(*(((const unsigned char *)(&x))+j)))<<((7-j)*8)) +#define PULL64(x) (B(x,0)|B(x,1)|B(x,2)|B(x,3)|B(x,4)|B(x,5)|B(x,6)|B(x,7)) +#endif + #ifndef ROTR #define ROTR(x,s) (((x)>>s) | (x)<<(64-s)) #endif @@ -364,7 +402,7 @@ static const SHA_LONG64 K512[80] = { #ifdef OPENSSL_SMALL_FOOTPRINT -static void sha512_block (SHA512_CTX *ctx, const void *in, size_t num) +static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num) { const SHA_LONG64 *W=in; SHA_LONG64 a,b,c,d,e,f,g,h,s0,s1,T1,T2; @@ -425,7 +463,7 @@ static void sha512_block (SHA512_CTX *ctx, const void *in, size_t num) T1 = X[(i)&0x0f] += s0 + s1 + X[(i+9)&0x0f]; \ ROUND_00_15(i,a,b,c,d,e,f,g,h); } while (0) -static void sha512_block (SHA512_CTX *ctx, const void *in, size_t num) +static void sha512_block_data_order (SHA512_CTX *ctx, const void *in, size_t num) { const SHA_LONG64 *W=in; SHA_LONG64 a,b,c,d,e,f,g,h,s0,s1,T1; diff --git a/crypto/sha/sha_locl.h b/crypto/sha/sha_locl.h index 3b1cea84e4..da46ddfe79 100644 --- a/crypto/sha/sha_locl.h +++ b/crypto/sha/sha_locl.h @@ -62,17 +62,11 @@ #include #include -#ifndef SHA_LONG_LOG2 -#define SHA_LONG_LOG2 2 /* default to 32 bits */ -#endif - #define DATA_ORDER_IS_BIG_ENDIAN #define HASH_LONG SHA_LONG -#define HASH_LONG_LOG2 SHA_LONG_LOG2 #define HASH_CTX SHA_CTX #define HASH_CBLOCK SHA_CBLOCK -#define HASH_LBLOCK SHA_LBLOCK #define HASH_MAKE_STRING(c,s) do { \ unsigned long ll; \ ll=(c)->h0; HOST_l2c(ll,(s)); \ @@ -88,12 +82,10 @@ # define HASH_TRANSFORM SHA_Transform # define HASH_FINAL SHA_Final # define HASH_INIT SHA_Init -# define 
HASH_BLOCK_HOST_ORDER sha_block_host_order # define HASH_BLOCK_DATA_ORDER sha_block_data_order # define Xupdate(a,ix,ia,ib,ic,id) (ix=(a)=(ia^ib^ic^id)) - void sha_block_host_order (SHA_CTX *c, const void *p,size_t num); - void sha_block_data_order (SHA_CTX *c, const void *p,size_t num); +static void sha_block_data_order (SHA_CTX *c, const void *p,size_t num); #elif defined(SHA_1) @@ -101,7 +93,6 @@ # define HASH_TRANSFORM SHA1_Transform # define HASH_FINAL SHA1_Final # define HASH_INIT SHA1_Init -# define HASH_BLOCK_HOST_ORDER sha1_block_host_order # define HASH_BLOCK_DATA_ORDER sha1_block_data_order # if defined(__MWERKS__) && defined(__MC68K__) /* Metrowerks for Motorola fails otherwise:-( */ @@ -114,24 +105,10 @@ ) # endif -# ifdef SHA1_ASM -# if defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__INTEL__) -# if !defined(B_ENDIAN) -# define sha1_block_host_order sha1_block_asm_host_order -# define DONT_IMPLEMENT_BLOCK_HOST_ORDER -# define sha1_block_data_order sha1_block_asm_data_order -# define DONT_IMPLEMENT_BLOCK_DATA_ORDER -# define HASH_BLOCK_DATA_ORDER_ALIGNED sha1_block_asm_data_order -# endif -# elif defined(__ia64) || defined(__ia64__) || defined(_M_IA64) -# define sha1_block_host_order sha1_block_asm_host_order -# define DONT_IMPLEMENT_BLOCK_HOST_ORDER -# define sha1_block_data_order sha1_block_asm_data_order -# define DONT_IMPLEMENT_BLOCK_DATA_ORDER -# endif -# endif - void sha1_block_host_order (SHA_CTX *c, const void *p,size_t num); - void sha1_block_data_order (SHA_CTX *c, const void *p,size_t num); +#ifndef SHA1_ASM +static +#endif +void sha1_block_data_order (SHA_CTX *c, const void *p,size_t num); #else # error "Either SHA_0 or SHA_1 must be defined." @@ -236,133 +213,8 @@ int HASH_INIT (SHA_CTX *c) # define X(i) XX[i] #endif -#ifndef DONT_IMPLEMENT_BLOCK_HOST_ORDER -void HASH_BLOCK_HOST_ORDER (SHA_CTX *c, const void *d, size_t num) - { - const SHA_LONG *W=d; - register unsigned MD32_REG_T A,B,C,D,E,T; -#ifndef MD32_XARRAY - unsigned MD32_REG_T XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, - XX8, XX9,XX10,XX11,XX12,XX13,XX14,XX15; -#else - SHA_LONG XX[16]; -#endif - - A=c->h0; - B=c->h1; - C=c->h2; - D=c->h3; - E=c->h4; - - for (;;) - { - BODY_00_15( 0,A,B,C,D,E,T,W[ 0]); - BODY_00_15( 1,T,A,B,C,D,E,W[ 1]); - BODY_00_15( 2,E,T,A,B,C,D,W[ 2]); - BODY_00_15( 3,D,E,T,A,B,C,W[ 3]); - BODY_00_15( 4,C,D,E,T,A,B,W[ 4]); - BODY_00_15( 5,B,C,D,E,T,A,W[ 5]); - BODY_00_15( 6,A,B,C,D,E,T,W[ 6]); - BODY_00_15( 7,T,A,B,C,D,E,W[ 7]); - BODY_00_15( 8,E,T,A,B,C,D,W[ 8]); - BODY_00_15( 9,D,E,T,A,B,C,W[ 9]); - BODY_00_15(10,C,D,E,T,A,B,W[10]); - BODY_00_15(11,B,C,D,E,T,A,W[11]); - BODY_00_15(12,A,B,C,D,E,T,W[12]); - BODY_00_15(13,T,A,B,C,D,E,W[13]); - BODY_00_15(14,E,T,A,B,C,D,W[14]); - BODY_00_15(15,D,E,T,A,B,C,W[15]); - - BODY_16_19(16,C,D,E,T,A,B,X( 0),W[ 0],W[ 2],W[ 8],W[13]); - BODY_16_19(17,B,C,D,E,T,A,X( 1),W[ 1],W[ 3],W[ 9],W[14]); - BODY_16_19(18,A,B,C,D,E,T,X( 2),W[ 2],W[ 4],W[10],W[15]); - BODY_16_19(19,T,A,B,C,D,E,X( 3),W[ 3],W[ 5],W[11],X( 0)); - - BODY_20_31(20,E,T,A,B,C,D,X( 4),W[ 4],W[ 6],W[12],X( 1)); - BODY_20_31(21,D,E,T,A,B,C,X( 5),W[ 5],W[ 7],W[13],X( 2)); - BODY_20_31(22,C,D,E,T,A,B,X( 6),W[ 6],W[ 8],W[14],X( 3)); - BODY_20_31(23,B,C,D,E,T,A,X( 7),W[ 7],W[ 9],W[15],X( 4)); - BODY_20_31(24,A,B,C,D,E,T,X( 8),W[ 8],W[10],X( 0),X( 5)); - BODY_20_31(25,T,A,B,C,D,E,X( 9),W[ 9],W[11],X( 1),X( 6)); - BODY_20_31(26,E,T,A,B,C,D,X(10),W[10],W[12],X( 2),X( 7)); - BODY_20_31(27,D,E,T,A,B,C,X(11),W[11],W[13],X( 3),X( 8)); - 
BODY_20_31(28,C,D,E,T,A,B,X(12),W[12],W[14],X( 4),X( 9)); - BODY_20_31(29,B,C,D,E,T,A,X(13),W[13],W[15],X( 5),X(10)); - BODY_20_31(30,A,B,C,D,E,T,X(14),W[14],X( 0),X( 6),X(11)); - BODY_20_31(31,T,A,B,C,D,E,X(15),W[15],X( 1),X( 7),X(12)); - - BODY_32_39(32,E,T,A,B,C,D,X( 0),X( 2),X( 8),X(13)); - BODY_32_39(33,D,E,T,A,B,C,X( 1),X( 3),X( 9),X(14)); - BODY_32_39(34,C,D,E,T,A,B,X( 2),X( 4),X(10),X(15)); - BODY_32_39(35,B,C,D,E,T,A,X( 3),X( 5),X(11),X( 0)); - BODY_32_39(36,A,B,C,D,E,T,X( 4),X( 6),X(12),X( 1)); - BODY_32_39(37,T,A,B,C,D,E,X( 5),X( 7),X(13),X( 2)); - BODY_32_39(38,E,T,A,B,C,D,X( 6),X( 8),X(14),X( 3)); - BODY_32_39(39,D,E,T,A,B,C,X( 7),X( 9),X(15),X( 4)); - - BODY_40_59(40,C,D,E,T,A,B,X( 8),X(10),X( 0),X( 5)); - BODY_40_59(41,B,C,D,E,T,A,X( 9),X(11),X( 1),X( 6)); - BODY_40_59(42,A,B,C,D,E,T,X(10),X(12),X( 2),X( 7)); - BODY_40_59(43,T,A,B,C,D,E,X(11),X(13),X( 3),X( 8)); - BODY_40_59(44,E,T,A,B,C,D,X(12),X(14),X( 4),X( 9)); - BODY_40_59(45,D,E,T,A,B,C,X(13),X(15),X( 5),X(10)); - BODY_40_59(46,C,D,E,T,A,B,X(14),X( 0),X( 6),X(11)); - BODY_40_59(47,B,C,D,E,T,A,X(15),X( 1),X( 7),X(12)); - BODY_40_59(48,A,B,C,D,E,T,X( 0),X( 2),X( 8),X(13)); - BODY_40_59(49,T,A,B,C,D,E,X( 1),X( 3),X( 9),X(14)); - BODY_40_59(50,E,T,A,B,C,D,X( 2),X( 4),X(10),X(15)); - BODY_40_59(51,D,E,T,A,B,C,X( 3),X( 5),X(11),X( 0)); - BODY_40_59(52,C,D,E,T,A,B,X( 4),X( 6),X(12),X( 1)); - BODY_40_59(53,B,C,D,E,T,A,X( 5),X( 7),X(13),X( 2)); - BODY_40_59(54,A,B,C,D,E,T,X( 6),X( 8),X(14),X( 3)); - BODY_40_59(55,T,A,B,C,D,E,X( 7),X( 9),X(15),X( 4)); - BODY_40_59(56,E,T,A,B,C,D,X( 8),X(10),X( 0),X( 5)); - BODY_40_59(57,D,E,T,A,B,C,X( 9),X(11),X( 1),X( 6)); - BODY_40_59(58,C,D,E,T,A,B,X(10),X(12),X( 2),X( 7)); - BODY_40_59(59,B,C,D,E,T,A,X(11),X(13),X( 3),X( 8)); - - BODY_60_79(60,A,B,C,D,E,T,X(12),X(14),X( 4),X( 9)); - BODY_60_79(61,T,A,B,C,D,E,X(13),X(15),X( 5),X(10)); - BODY_60_79(62,E,T,A,B,C,D,X(14),X( 0),X( 6),X(11)); - BODY_60_79(63,D,E,T,A,B,C,X(15),X( 1),X( 7),X(12)); - BODY_60_79(64,C,D,E,T,A,B,X( 0),X( 2),X( 8),X(13)); - BODY_60_79(65,B,C,D,E,T,A,X( 1),X( 3),X( 9),X(14)); - BODY_60_79(66,A,B,C,D,E,T,X( 2),X( 4),X(10),X(15)); - BODY_60_79(67,T,A,B,C,D,E,X( 3),X( 5),X(11),X( 0)); - BODY_60_79(68,E,T,A,B,C,D,X( 4),X( 6),X(12),X( 1)); - BODY_60_79(69,D,E,T,A,B,C,X( 5),X( 7),X(13),X( 2)); - BODY_60_79(70,C,D,E,T,A,B,X( 6),X( 8),X(14),X( 3)); - BODY_60_79(71,B,C,D,E,T,A,X( 7),X( 9),X(15),X( 4)); - BODY_60_79(72,A,B,C,D,E,T,X( 8),X(10),X( 0),X( 5)); - BODY_60_79(73,T,A,B,C,D,E,X( 9),X(11),X( 1),X( 6)); - BODY_60_79(74,E,T,A,B,C,D,X(10),X(12),X( 2),X( 7)); - BODY_60_79(75,D,E,T,A,B,C,X(11),X(13),X( 3),X( 8)); - BODY_60_79(76,C,D,E,T,A,B,X(12),X(14),X( 4),X( 9)); - BODY_60_79(77,B,C,D,E,T,A,X(13),X(15),X( 5),X(10)); - BODY_60_79(78,A,B,C,D,E,T,X(14),X( 0),X( 6),X(11)); - BODY_60_79(79,T,A,B,C,D,E,X(15),X( 1),X( 7),X(12)); - - c->h0=(c->h0+E)&0xffffffffL; - c->h1=(c->h1+T)&0xffffffffL; - c->h2=(c->h2+A)&0xffffffffL; - c->h3=(c->h3+B)&0xffffffffL; - c->h4=(c->h4+C)&0xffffffffL; - - if (--num == 0) break; - - A=c->h0; - B=c->h1; - C=c->h2; - D=c->h3; - E=c->h4; - - W+=SHA_LBLOCK; - } - } -#endif - -#ifndef DONT_IMPLEMENT_BLOCK_DATA_ORDER -void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num) +#if !defined(SHA_1) || !defined(SHA1_ASM) +static void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num) { const unsigned char *data=p; register unsigned MD32_REG_T A,B,C,D,E,T,l; @@ -380,25 +232,53 @@ void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num) E=c->h4; for (;;) - { + { + const union { long 
one; char little; } is_endian = {1}; - HOST_c2l(data,l); X( 0)=l; HOST_c2l(data,l); X( 1)=l; - BODY_00_15( 0,A,B,C,D,E,T,X( 0)); HOST_c2l(data,l); X( 2)=l; - BODY_00_15( 1,T,A,B,C,D,E,X( 1)); HOST_c2l(data,l); X( 3)=l; - BODY_00_15( 2,E,T,A,B,C,D,X( 2)); HOST_c2l(data,l); X( 4)=l; - BODY_00_15( 3,D,E,T,A,B,C,X( 3)); HOST_c2l(data,l); X( 5)=l; - BODY_00_15( 4,C,D,E,T,A,B,X( 4)); HOST_c2l(data,l); X( 6)=l; - BODY_00_15( 5,B,C,D,E,T,A,X( 5)); HOST_c2l(data,l); X( 7)=l; - BODY_00_15( 6,A,B,C,D,E,T,X( 6)); HOST_c2l(data,l); X( 8)=l; - BODY_00_15( 7,T,A,B,C,D,E,X( 7)); HOST_c2l(data,l); X( 9)=l; - BODY_00_15( 8,E,T,A,B,C,D,X( 8)); HOST_c2l(data,l); X(10)=l; - BODY_00_15( 9,D,E,T,A,B,C,X( 9)); HOST_c2l(data,l); X(11)=l; - BODY_00_15(10,C,D,E,T,A,B,X(10)); HOST_c2l(data,l); X(12)=l; - BODY_00_15(11,B,C,D,E,T,A,X(11)); HOST_c2l(data,l); X(13)=l; - BODY_00_15(12,A,B,C,D,E,T,X(12)); HOST_c2l(data,l); X(14)=l; - BODY_00_15(13,T,A,B,C,D,E,X(13)); HOST_c2l(data,l); X(15)=l; - BODY_00_15(14,E,T,A,B,C,D,X(14)); - BODY_00_15(15,D,E,T,A,B,C,X(15)); + if (!is_endian.little && sizeof(SHA_LONG)==4 && ((size_t)p%4)==0) + { + const SHA_LONG *W=(const SHA_LONG *)data; + + X( 0) = W[0]; X( 1) = W[ 1]; + BODY_00_15( 0,A,B,C,D,E,T,X( 0)); X( 2) = W[ 2]; + BODY_00_15( 1,T,A,B,C,D,E,X( 1)); X( 3) = W[ 3]; + BODY_00_15( 2,E,T,A,B,C,D,X( 2)); X( 4) = W[ 4]; + BODY_00_15( 3,D,E,T,A,B,C,X( 3)); X( 5) = W[ 5]; + BODY_00_15( 4,C,D,E,T,A,B,X( 4)); X( 6) = W[ 6]; + BODY_00_15( 5,B,C,D,E,T,A,X( 5)); X( 7) = W[ 7]; + BODY_00_15( 6,A,B,C,D,E,T,X( 6)); X( 8) = W[ 8]; + BODY_00_15( 7,T,A,B,C,D,E,X( 7)); X( 9) = W[ 9]; + BODY_00_15( 8,E,T,A,B,C,D,X( 8)); X(10) = W[10]; + BODY_00_15( 9,D,E,T,A,B,C,X( 9)); X(11) = W[11]; + BODY_00_15(10,C,D,E,T,A,B,X(10)); X(12) = W[12]; + BODY_00_15(11,B,C,D,E,T,A,X(11)); X(13) = W[13]; + BODY_00_15(12,A,B,C,D,E,T,X(12)); X(14) = W[14]; + BODY_00_15(13,T,A,B,C,D,E,X(13)); X(15) = W[15]; + BODY_00_15(14,E,T,A,B,C,D,X(14)); + BODY_00_15(15,D,E,T,A,B,C,X(15)); + + data += SHA_CBLOCK; + } + else + { + HOST_c2l(data,l); X( 0)=l; HOST_c2l(data,l); X( 1)=l; + BODY_00_15( 0,A,B,C,D,E,T,X( 0)); HOST_c2l(data,l); X( 2)=l; + BODY_00_15( 1,T,A,B,C,D,E,X( 1)); HOST_c2l(data,l); X( 3)=l; + BODY_00_15( 2,E,T,A,B,C,D,X( 2)); HOST_c2l(data,l); X( 4)=l; + BODY_00_15( 3,D,E,T,A,B,C,X( 3)); HOST_c2l(data,l); X( 5)=l; + BODY_00_15( 4,C,D,E,T,A,B,X( 4)); HOST_c2l(data,l); X( 6)=l; + BODY_00_15( 5,B,C,D,E,T,A,X( 5)); HOST_c2l(data,l); X( 7)=l; + BODY_00_15( 6,A,B,C,D,E,T,X( 6)); HOST_c2l(data,l); X( 8)=l; + BODY_00_15( 7,T,A,B,C,D,E,X( 7)); HOST_c2l(data,l); X( 9)=l; + BODY_00_15( 8,E,T,A,B,C,D,X( 8)); HOST_c2l(data,l); X(10)=l; + BODY_00_15( 9,D,E,T,A,B,C,X( 9)); HOST_c2l(data,l); X(11)=l; + BODY_00_15(10,C,D,E,T,A,B,X(10)); HOST_c2l(data,l); X(12)=l; + BODY_00_15(11,B,C,D,E,T,A,X(11)); HOST_c2l(data,l); X(13)=l; + BODY_00_15(12,A,B,C,D,E,T,X(12)); HOST_c2l(data,l); X(14)=l; + BODY_00_15(13,T,A,B,C,D,E,X(13)); HOST_c2l(data,l); X(15)=l; + BODY_00_15(14,E,T,A,B,C,D,X(14)); + BODY_00_15(15,D,E,T,A,B,C,X(15)); + } BODY_16_19(16,C,D,E,T,A,B,X( 0),X( 0),X( 2),X( 8),X(13)); BODY_16_19(17,B,C,D,E,T,A,X( 1),X( 1),X( 3),X( 9),X(14)); @@ -483,7 +363,7 @@ void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num) D=c->h3; E=c->h4; - } + } } #endif @@ -518,54 +398,8 @@ void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num) E=D, D=C, C=ROTATE(B,30), B=A; \ A=ROTATE(A,5)+T+xa; } while(0) -#ifndef DONT_IMPLEMENT_BLOCK_HOST_ORDER -void HASH_BLOCK_HOST_ORDER (SHA_CTX *c, const void *d, size_t num) - { - const SHA_LONG 
*W=d; - register unsigned MD32_REG_T A,B,C,D,E,T; - int i; - SHA_LONG X[16]; - - A=c->h0; - B=c->h1; - C=c->h2; - D=c->h3; - E=c->h4; - - for (;;) - { - for (i=0;i<16;i++) - { X[i]=W[i]; BODY_00_15(X[i]); } - for (i=0;i<4;i++) - { BODY_16_19(X[i], X[i+2], X[i+8], X[(i+13)&15]); } - for (;i<24;i++) - { BODY_20_39(X[i&15], X[(i+2)&15], X[(i+8)&15],X[(i+13)&15]); } - for (i=0;i<20;i++) - { BODY_40_59(X[(i+8)&15],X[(i+10)&15],X[i&15], X[(i+5)&15]); } - for (i=4;i<24;i++) - { BODY_60_79(X[(i+8)&15],X[(i+10)&15],X[i&15], X[(i+5)&15]); } - - c->h0=(c->h0+A)&0xffffffffL; - c->h1=(c->h1+B)&0xffffffffL; - c->h2=(c->h2+C)&0xffffffffL; - c->h3=(c->h3+D)&0xffffffffL; - c->h4=(c->h4+E)&0xffffffffL; - - if (--num == 0) break; - - A=c->h0; - B=c->h1; - C=c->h2; - D=c->h3; - E=c->h4; - - W+=SHA_LBLOCK; - } - } -#endif - -#ifndef DONT_IMPLEMENT_BLOCK_DATA_ORDER -void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num) +#if !defined(SHA_1) || !defined(SHA1_ASM) +static void HASH_BLOCK_DATA_ORDER (SHA_CTX *c, const void *p, size_t num) { const unsigned char *data=p; register unsigned MD32_REG_T A,B,C,D,E,T,l; -- 2.25.1
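The change that ties this whole patch together — in sha1-586.pl, sha1-ia64.pl, sha256.c, sha512.c and sha_locl.h alike — is that the separate *_host_order entry points disappear and the single *_data_order function decides at run time whether it may load whole words (big-endian host, suitably aligned input) or must assemble them byte by byte. A minimal stand-alone sketch of that run-time test follows; the names process_block and WORD32 are illustrative and not taken from the patch:

/* Stand-alone sketch of the run-time endianness/alignment test used by the
 * rewritten *_block_data_order functions; all names here are illustrative. */
#include <stdio.h>
#include <stddef.h>

typedef unsigned int WORD32;                    /* stands in for SHA_LONG */

static void process_block(const void *in)
{
    const union { long one; char little; } is_endian = {1};

    if (!is_endian.little && sizeof(WORD32) == 4 && ((size_t)in % 4) == 0)
        puts("big-endian host, aligned input: load 32-bit words directly");
    else
        puts("little-endian host or unaligned input: assemble words byte by byte");
}

int main(void)
{
    WORD32 words[16] = {0};
    const unsigned char *p = (const unsigned char *)words;

    process_block(p);       /* aligned call: may take the word-load path */
    process_block(p + 1);   /* deliberately misaligned: byte-by-byte path */
    return 0;
}

This is why HASH_BLOCK_HOST_ORDER and the DONT_IMPLEMENT_* machinery could be dropped from md32_common.h and sha_locl.h: the one remaining block function recovers the host-order speed whenever the caller happens to hand it aligned data on a big-endian machine.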