From bd3576d2ddedb0492f5bd3c1e47c15778e4fbe3c Mon Sep 17 00:00:00 2001 From: =?utf8?q?Ulf=20M=C3=B6ller?= Date: Thu, 13 May 1999 13:16:42 +0000 Subject: [PATCH] Reorganize and speed up MD5. Submitted by: Andy Polyakov --- CHANGES | 3 + Configure | 7 +- crypto/md32_common.h | 592 +++++++++++++++++++ crypto/md5/Makefile.ssl | 10 +- crypto/md5/asm/md5-586.pl | 3 +- crypto/md5/asm/md5-sparcv9.S | 1035 ++++++++++++++++++++++++++++++++++ crypto/md5/md5.h | 36 +- crypto/md5/md5_dgst.c | 365 +++++------- crypto/md5/md5_locl.h | 153 +++-- crypto/md5/md5_one.c | 3 +- 10 files changed, 1867 insertions(+), 340 deletions(-) create mode 100644 crypto/md32_common.h create mode 100644 crypto/md5/asm/md5-sparcv9.S diff --git a/CHANGES b/CHANGES index 770bd1982b..a3a90ddc39 100644 --- a/CHANGES +++ b/CHANGES @@ -5,6 +5,9 @@ Changes between 0.9.2b and 0.9.3 + *) Reorganize and speed up MD5. + [Andy Polyakov ] + *) VMS support. [Richard Levitte ] diff --git a/Configure b/Configure index b7f910f284..948da07b9f 100755 --- a/Configure +++ b/Configure @@ -106,7 +106,7 @@ my %table=( # Solaris setups "solaris-x86-gcc","gcc:-O3 -fomit-frame-pointer -m486 -Wall -DL_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG $x86_gcc_des $x86_gcc_opts:$x86_sol_asm", "solaris-sparc-gcc","gcc:-O3 -fomit-frame-pointer -mv8 -Wall -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8.o::", -"solaris-usparc-gcc","gcc:-O3 -fomit-frame-pointer -mcpu=ultrasparc -Wall -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8plus-gcc.o::", +"solaris-usparc-gcc","gcc:-O3 -fomit-frame-pointer -mcpu=ultrasparc -Wall -DB_ENDIAN -DULTRASPARC:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8plus-gcc.o:::asm/md5-sparcv8plus.o:", "debug-solaris-sparc-gcc","gcc:-O3 -g -mv8 -Wall -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:::", "debug-solaris-usparc-gcc","gcc:-O3 -g -mcpu=ultrasparc -Wall -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8plus-gcc.o::", @@ -115,12 +115,11 @@ my %table=( # SC4 is ok, better than gcc even on bn as long as you tell it -xarch=v8 # -fast slows things like DES down quite a lot # Don't use -xtarget=ultra with SC4.2. It is broken, and will break exptest. -# SC5.0 with the compiler common patch works. "solaris-sparc-sc4","cc:-xarch=v8 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8.o::", "solaris-usparc-sc4","cc:-xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o::", # SC5.0 note: Compiler common patch 107357-01 or later is required! 
-"solaris-usparc-sc5","cc:-xtarget=ultra -xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o::", -"solaris64-usparc-sc5","cc:-xtarget=ultra -xarch=v9 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:SIXTY_FOUR_BIT_LONG RC4_CHAR DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR:::", +"solaris-usparc-sc5","cc:-xtarget=ultra -xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DULTRASPARC -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o:::asm/md5-sparcv8plus.o:", +"solaris64-usparc-sc5","cc:-xtarget=ultra -xarch=v9 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DULTRASPARC:-D_REENTRANT:-lsocket -lnsl:SIXTY_FOUR_BIT_LONG RC4_CHAR DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::::asm/md5-sparcv9.o:", # Sunos configs, assuming sparc for the gcc one. ##"sunos-cc", "cc:-O4 -DNOPROTO -DNOCONST:(unknown)::DES_UNROLL:::", diff --git a/crypto/md32_common.h b/crypto/md32_common.h new file mode 100644 index 0000000000..977ea8e73b --- /dev/null +++ b/crypto/md32_common.h @@ -0,0 +1,592 @@ +/* crypto/md32_common.h */ +/* ==================================================================== + * Copyright (c) 1999 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * licensing@OpenSSL.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ + +/* + * This is a generic 32 bit "collector" for message digest algorithms. + * Whenever needed it collects input character stream into chunks of + * 32 bit values and invokes a block function that performs actual hash + * calculations. + * + * Porting guide. + * + * Obligatory macros: + * + * DATA_ORDER_IS_BIG_ENDIAN or DATA_ORDER_IS_LITTLE_ENDIAN + * this macro defines byte order of input stream. + * HASH_CBLOCK + * size of a unit chunk HASH_BLOCK operates on. + * HASH_LONG + * has to be at lest 32 bit wide, if it's wider, then + * HASH_LONG_LOG2 *has to* be defined along + * HASH_CTX + * context structure that at least contains following + * members: + * typedef struct { + * ... + * HASH_LONG Nl,Nh; + * HASH_LONG data[HASH_LBLOCK]; + * int num; + * ... + * } HASH_CTX; + * HASH_UPDATE + * name of "Update" function, implemented here. + * HASH_TRANSFORM + * name of "Transform" function, implemented here. + * HASH_FINAL + * name of "Final" function, implemented here. + * HASH_BLOCK_HOST_ORDER + * name of "block" function treating *aligned* input message + * in host byte order, implemented externally. + * HASH_BLOCK_DATA_ORDER + * name of "block" function treating *unaligned* input message + * in original (data) byte order, implemented externally (it + * actually is optional if data and host are of the same + * "endianess"). + * + * Optional macros: + * + * B_ENDIAN or L_ENDIAN + * defines host byte-order. + * HASH_LONG_LOG2 + * defaults to 2 if not states otherwise. + * HASH_LBLOCK + * assumed to be HASH_CBLOCK/4 if not stated otherwise. + * HASH_BLOCK_DATA_ORDER_ALIGNED + * alternative "block" function capable of treating + * aligned input message in original (data) order, + * implemented externally. + * + * MD5 example: + * + * #define DATA_ORDER_IS_LITTLE_ENDIAN + * + * #define HASH_LONG MD5_LONG + * #define HASH_LONG_LOG2 MD5_LONG_LOG2 + * #define HASH_CTX MD5_CTX + * #define HASH_CBLOCK MD5_CBLOCK + * #define HASH_LBLOCK MD5_LBLOCK + * #define HASH_UPDATE MD5_Update + * #define HASH_TRANSFORM MD5_Transform + * #define HASH_FINAL MD5_Final + * #define HASH_BLOCK_HOST_ORDER md5_block_host_order + * #define HASH_BLOCK_DATA_ORDER md5_block_data_order + * + * + */ + +#if !defined(DATA_ORDER_IS_BIG_ENDIAN) && !defined(DATA_ORDER_IS_LITTLE_ENDIAN) +#error "DATA_ORDER must be defined!" +#endif + +#ifndef HASH_CBLOCK +#error "HASH_CBLOCK must be defined!" +#endif +#ifndef HASH_LONG +#error "HASH_LONG must be defined!" +#endif +#ifndef HASH_CTX +#error "HASH_CTX must be defined!" +#endif + +#ifndef HASH_UPDATE +#error "HASH_UPDATE must be defined!" +#endif +#ifndef HASH_TRANSFORM +#error "HASH_TRANSFORM must be defined!" +#endif +#ifndef HASH_FINAL +#error "HASH_FINAL must be defined!" 
+#endif + +#ifndef HASH_BLOCK_HOST_ORDER +#error "HASH_BLOCK_HOST_ORDER must be defined!" +#endif + +#if 0 +/* + * Moved below as it's required only if HASH_BLOCK_DATA_ORDER_ALIGNED + * isn't defined. + */ +#ifndef HASH_BLOCK_DATA_ORDER +#error "HASH_BLOCK_DATA_ORDER must be defined!" +#endif +#endif + +#ifndef HASH_LBLOCK +#define HASH_LBLOCK (HASH_CBLOCK/4) +#endif + +#ifndef HASH_LONG_LOG2 +#define HASH_LONG_LOG2 2 +#endif + +/* + * Engage compiler specific rotate intrinsic function if available. + */ +#undef ROTATE +#ifndef PEDANTIC +# if defined(_MSC_VER) +# define ROTATE(a,n) _lrotl(a,n) +# elif defined(__GNUC__) && __GNUC__>=2 + /* + * Some GNU C inline assembler templates. Note that these are + * rotates by *constant* number of bits! But that's exactly + * what we need here... + * + * + */ +# if defined(__i386) +# define ROTATE(a,n) ({ register unsigned int ret; \ + asm volatile ( \ + "roll %1,%0" \ + : "=r"(ret) \ + : "I"(n), "0"(a) \ + : "cc"); \ + ret; \ + }) +# elif defined(__powerpc) +# define ROTATE(a,n) ({ register unsigned int ret; \ + asm volatile ( \ + "rlwinm %0,%1,%2,0,31" \ + : "=r"(ret) \ + : "r"(a), "I"(n)); \ + ret; \ + }) +# endif +# endif + +/* + * Engage compiler specific "fetch in reverse byte order" + * intrinsic function if available. + */ +# if defined(__GNUC__) && __GNUC__>=2 + /* some GNU C inline assembler templates by */ +# if defined(__i386) && !defined(I386_ONLY) +# define BE_FETCH32(a) ({ register unsigned int l=(a);\ + asm volatile ( \ + "bswapl %0" \ + : "=r"(l) : "0"(l)); \ + l; \ + }) +# elif defined(__powerpc) +# define LE_FETCH32(a) ({ register unsigned int l; \ + asm volatile ( \ + "lwbrx %0,0,%1" \ + : "=r"(l) \ + : "r"(a)); \ + l; \ + }) + +# elif defined(__sparc) && defined(ULTRASPARC) +# define LE_FETCH32(a) ({ register unsigned int l; \ + asm volatile ( \ + "lda [%1]#ASI_PRIMARY_LITTLE,%0"\ + : "=r"(l) \ + : "r"(a)); \ + l; \ + }) +# endif +# endif +#endif /* PEDANTIC */ + +#if HASH_LONG_LOG2==2 /* Engage only if sizeof(HASH_LONG)== 4 */ +/* A nice byte order reversal from Wei Dai */ +#ifdef ROTATE +/* 5 instructions with rotate instruction, else 9 */ +#define REVERSE_FETCH32(a,l) ( \ + l=*(const HASH_LONG *)(a), \ + ((ROTATE(l,8)&0x00FF00FF)|(ROTATE((l&0x00FF00FF),24))) \ + ) +#else +/* 6 instructions with rotate instruction, else 8 */ +#define REVERSE_FETCH32(a,l) ( \ + l=*(const HASH_LONG *)(a), \ + l=(((l>>8)&0x00FF00FF)|((l&0x00FF00FF)<<8)), \ + ROTATE(l,16) \ + ) +/* + * Originally the middle line started with l=(((l&0xFF00FF00)>>8)|... + * It's rewritten as above for two reasons: + * - RISCs aren't good at long constants and have to explicitely + * compose 'em with several (well, usually 2) instructions in a + * register before performing the actual operation and (as you + * already realized:-) having same constant should inspire the + * compiler to permanently allocate the only register for it; + * - most modern CPUs have two ALUs, but usually only one has + * circuitry for shifts:-( this minor tweak inspires compiler + * to schedule shift instructions in a better way... + * + * + */ +#endif +#endif + +#ifndef ROTATE +#define ROTATE(a,n) (((a)<<(n))|(((a)&0xffffffff)>>(32-(n)))) +#endif + +/* + * Make some obvious choices. E.g., HASH_BLOCK_DATA_ORDER_ALIGNED + * and HASH_BLOCK_HOST_ORDER ought to be the same if input data + * and host are of the same "endianess". It's possible to mask + * this with blank #define HASH_BLOCK_DATA_ORDER though... 
+ * + * + */ +#if defined(B_ENDIAN) +# if defined(DATA_ORDER_IS_BIG_ENDIAN) +# if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_LONG_LOG2==2 +# define HASH_BLOCK_DATA_ORDER_ALIGNED HASH_BLOCK_HOST_ORDER +# endif +# elif defined(DATA_ORDER_IS_LITTLE_ENDIAN) +# ifndef HOST_FETCH32 +# ifdef LE_FETCH32 +# define HOST_FETCH32(p,l) LE_FETCH32(p) +# elif defined(REVERSE_FETCH32) +# define HOST_FETCH32(p,l) REVERSE_FETCH32(p,l) +# endif +# endif +# endif +#elif defined(L_ENDIAN) +# if defined(DATA_ORDER_IS_LITTLE_ENDIAN) +# if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_LONG_LOG2==2 +# define HASH_BLOCK_DATA_ORDER_ALIGNED HASH_BLOCK_HOST_ORDER +# endif +# elif defined(DATA_ORDER_IS_BIG_ENDIAN) +# ifndef HOST_FETCH32 +# ifdef BE_FETCH32 +# define HOST_FETCH32(p,l) BE_FETCH32(p) +# elif defined(REVERSE_FETCH32) +# define HOST_FETCH32(p,l) REVERSE_FETCH32(p,l) +# endif +# endif +# endif +#endif + +#if !defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_BLOCK_DATA_ORDER_ALIGNED!=1 +#ifndef HASH_BLOCK_DATA_ORDER +#error "HASH_BLOCK_DATA_ORDER must be defined!" +#endif +#endif + +#if defined(DATA_ORDER_IS_BIG_ENDIAN) + +#define HOST_c2l(c,l) (l =(((unsigned long)(*((c)++)))<<24), \ + l|=(((unsigned long)(*((c)++)))<<16), \ + l|=(((unsigned long)(*((c)++)))<< 8), \ + l|=(((unsigned long)(*((c)++))) ), \ + l) +#define HOST_p_c2l(c,l,n) { \ + switch (n) { \ + case 0: l =((unsigned long)(*((c)++)))<<24; \ + case 1: l|=((unsigned long)(*((c)++)))<<16; \ + case 2: l|=((unsigned long)(*((c)++)))<< 8; \ + case 3: l|=((unsigned long)(*((c)++))); \ + } } +#define HOST_p_c2l_p(c,l,sc,len) { \ + switch (sc) { \ + case 0: l =((unsigned long)(*((c)++)))<<24; \ + if (--len == 0) break; \ + case 1: l|=((unsigned long)(*((c)++)))<<16; \ + if (--len == 0) break; \ + case 2: l|=((unsigned long)(*((c)++)))<< 8; \ + } } +/* NOTE the pointer is not incremented at the end of this */ +#define HOST_c2l_p(c,l,n) { \ + l=0; (c)+=n; \ + switch (n) { \ + case 3: l =((unsigned long)(*(--(c))))<< 8; \ + case 2: l|=((unsigned long)(*(--(c))))<<16; \ + case 1: l|=((unsigned long)(*(--(c))))<<24; \ + } } +#define HOST_l2c(l,c) (*((c)++)=(unsigned char)(((l)>>24)&0xff), \ + *((c)++)=(unsigned char)(((l)>>16)&0xff), \ + *((c)++)=(unsigned char)(((l)>> 8)&0xff), \ + *((c)++)=(unsigned char)(((l) )&0xff), \ + l) + +#elif defined(DATA_ORDER_IS_LITTLE_ENDIAN) + +#define HOST_c2l(c,l) (l =(((unsigned long)(*((c)++))) ), \ + l|=(((unsigned long)(*((c)++)))<< 8), \ + l|=(((unsigned long)(*((c)++)))<<16), \ + l|=(((unsigned long)(*((c)++)))<<24), \ + l) +#define HOST_p_c2l(c,l,n) { \ + switch (n) { \ + case 0: l =((unsigned long)(*((c)++))); \ + case 1: l|=((unsigned long)(*((c)++)))<< 8; \ + case 2: l|=((unsigned long)(*((c)++)))<<16; \ + case 3: l|=((unsigned long)(*((c)++)))<<24; \ + } } +#define HOST_p_c2l_p(c,l,sc,len) { \ + switch (sc) { \ + case 0: l =((unsigned long)(*((c)++))); \ + if (--len == 0) break; \ + case 1: l|=((unsigned long)(*((c)++)))<< 8; \ + if (--len == 0) break; \ + case 2: l|=((unsigned long)(*((c)++)))<<16; \ + } } +/* NOTE the pointer is not incremented at the end of this */ +#define HOST_c2l_p(c,l,n) { \ + l=0; (c)+=n; \ + switch (n) { \ + case 3: l =((unsigned long)(*(--(c))))<<16; \ + case 2: l|=((unsigned long)(*(--(c))))<< 8; \ + case 1: l|=((unsigned long)(*(--(c)))); \ + } } +#define HOST_l2c(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \ + *((c)++)=(unsigned char)(((l)>> 8)&0xff), \ + *((c)++)=(unsigned char)(((l)>>16)&0xff), \ + *((c)++)=(unsigned char)(((l)>>24)&0xff), \ + l) + +#endif + +/* + * Time for 
some action:-) + */ + +void HASH_UPDATE (HASH_CTX *c, const unsigned char *data, unsigned long len) + { + register HASH_LONG * p; + register unsigned long l; + int sw,sc,ew,ec; + + if (len==0) return; + + l=(c->Nl+(len<<3))&0xffffffffL; + /* 95-05-24 eay Fixed a bug with the overflow handling, thanks to + * Wei Dai for pointing it out. */ + if (l < c->Nl) /* overflow */ + c->Nh++; + c->Nh+=(len>>29); + c->Nl=l; + + if (c->num != 0) + { + p=c->data; + sw=c->num>>2; + sc=c->num&0x03; + + if ((c->num+len) >= HASH_CBLOCK) + { + l=p[sw]; HOST_p_c2l(data,l,sc); p[sw++]=l; + for (; swnum); + c->num=0; + /* drop through and do the rest */ + } + else + { + c->num+=len; + if ((sc+len) < 4) /* ugly, add char's to a word */ + { + l=p[sw]; HOST_p_c2l_p(data,l,sc,len); p[sw]=l; + } + else + { + ew=(c->num>>2); + ec=(c->num&0x03); + l=p[sw]; HOST_p_c2l(data,l,sc); p[sw++]=l; + for (; sw < ew; sw++) + { + HOST_c2l(data,l); p[sw]=l; + } + if (ec) + { + HOST_c2l_p(data,l,ec); p[sw]=l; + } + } + return; + } + } + + sw=len/HASH_CBLOCK; + if (sw > 0) + { +#if defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_BLOCK_DATA_ORDER_ALIGNED!=1 + /* + * Note that HASH_BLOCK_DATA_ORDER_ALIGNED gets defined + * only if sizeof(HASH_LONG)==4. + */ + if ((((unsigned long)data)%4) == 0) + { + HASH_BLOCK_DATA_ORDER_ALIGNED (c,(HASH_LONG *)data,sw); + sw*=HASH_CBLOCK; + data+=sw; + len-=sw; + } + else +#if !defined(HASH_BLOCK_DATA_ORDER) + while (sw--) + { + memcpy (p=c->data,data,HASH_CBLOCK); + HASH_BLOCK_DATA_ORDER_ALIGNED(c,p,1); + data+=HASH_CBLOCK; + len-=HASH_CBLOCK; + } +#endif +#endif +#if defined(HASH_BLOCK_DATA_ORDER) + { + HASH_BLOCK_DATA_ORDER (c,(HASH_LONG *)data,sw); + sw*=HASH_CBLOCK; + data+=sw; + len-=sw; + } +#endif + } + + if (len!=0) + { + p = c->data; + c->num = len; + ew=len>>2; /* words to copy */ + ec=len&0x03; + for (; ew; ew--,p++) + { + HOST_c2l(data,l); *p=l; + } + HOST_c2l_p(data,l,ec); + *p=l; + } + } + + +void HASH_TRANSFORM (HASH_CTX *c, unsigned char *data) + { +#if defined(HASH_BLOCK_DATA_ORDER_ALIGNED) && HASH_BLOCK_DATA_ORDER_ALIGNED!=1 + if ((((unsigned long)data)%4) == 0) + HASH_BLOCK_DATA_ORDER_ALIGNED (c,(HASH_LONG *)data,1); + else +#if !defined(HASH_BLOCK_DATA_ORDER) + { + memcpy (c->data,data,HASH_CBLOCK); + HASH_BLOCK_DATA_ORDER_ALIGNED (c,c->data,1); + } +#endif +#endif +#if defined(HASH_BLOCK_DATA_ORDER) + HASH_BLOCK_DATA_ORDER (c,(HASH_LONG *)data,1); +#endif + } + + +void HASH_FINAL (unsigned char *md, HASH_CTX *c) + { + register HASH_LONG *p; + register unsigned long l; + register int i,j; + static const unsigned char end[4]={0x80,0x00,0x00,0x00}; + const unsigned char *cp=end; + + /* c->num should definitly have room for at least one more byte. */ + p=c->data; + i=c->num>>2; + j=c->num&0x03; + +#if 0 + /* purify often complains about the following line as an + * Uninitialized Memory Read. While this can be true, the + * following p_c2l macro will reset l when that case is true. + * This is because j&0x03 contains the number of 'valid' bytes + * already in p[i]. If and only if j&0x03 == 0, the UMR will + * occur but this is also the only time p_c2l will do + * l= *(cp++) instead of l|= *(cp++) + * Many thanks to Alex Tang for pickup this + * 'potential bug' */ +#ifdef PURIFY + if (j==0) p[i]=0; /* Yeah, but that's not the way to fix it:-) */ +#endif + l=p[i]; +#else + l = (j==0) ? 
0 : p[i]; +#endif + HOST_p_c2l(cp,l,j); p[i++]=l; /* i is the next 'undefined word' */ + + if (i>(HASH_LBLOCK-2)) /* save room for Nl and Nh */ + { + if (iNh; + p[HASH_LBLOCK-1]=c->Nl; +#elif defined(DATA_ORDER_IS_LITTLE_ENDIAN) + p[HASH_LBLOCK-2]=c->Nl; + p[HASH_LBLOCK-1]=c->Nh; +#endif + HASH_BLOCK_HOST_ORDER (c,p,1); + + l=c->A; HOST_l2c(l,md); + l=c->B; HOST_l2c(l,md); + l=c->C; HOST_l2c(l,md); + l=c->D; HOST_l2c(l,md); + + c->num=0; + /* clear stuff, HASH_BLOCK may be leaving some stuff on the stack + * but I'm not worried :-) + memset((void *)c,0,sizeof(HASH_CTX)); + */ + } diff --git a/crypto/md5/Makefile.ssl b/crypto/md5/Makefile.ssl index e27cfca0b0..f8eaf628b3 100644 --- a/crypto/md5/Makefile.ssl +++ b/crypto/md5/Makefile.ssl @@ -66,6 +66,14 @@ asm/mx86bsdi.o: asm/mx86unix.cpp asm/mx86unix.cpp: asm/md5-586.pl (cd asm; $(PERL) md5-586.pl cpp >mx86unix.cpp) +# works for both SC and gcc +asm/md5-sparcv8plus.o: asm/md5-sparcv9.S + $(CPP) -DULTRASPARC -DMD5_BLOCK_DATA_ORDER asm/md5-sparcv9.S | as -xarch=v8plus /dev/fd/0 -o asm/md5-sparcv8plus.o + +asm/md5-sparcv9.o: asm/md5-sparcv9.S + $(CC) -xarch=v9 -DULTRASPARC -DMD5_BLOCK_DATA_ORDER -c asm/md5-sparcv9.S -o asm/md5-sparcv9.o + + files: $(PERL) $(TOP)/util/files.pl Makefile.ssl >> $(TOP)/MINFO @@ -103,5 +111,5 @@ clean: # DO NOT DELETE THIS LINE -- make depend depends on it. md5_dgst.o: ../../include/openssl/md5.h ../../include/openssl/opensslv.h -md5_dgst.o: md5_locl.h +md5_dgst.o: ../md32_common.h md5_locl.h md5_one.o: ../../include/openssl/md5.h md5_locl.h diff --git a/crypto/md5/asm/md5-586.pl b/crypto/md5/asm/md5-586.pl index 0249e100e1..5fc6a205ce 100644 --- a/crypto/md5/asm/md5-586.pl +++ b/crypto/md5/asm/md5-586.pl @@ -29,7 +29,7 @@ $X="esi"; 0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9, # R3 ); -&md5_block("md5_block_x86"); +&md5_block("md5_block_asm_host_order"); &asm_finish(); sub Np @@ -183,6 +183,7 @@ sub md5_block &mov($X, &wparam(1)); # esi &mov($C, &wparam(2)); &push("ebp"); + &shl($C, 6); &push("ebx"); &add($C, $X); # offset we end at &sub($C, 64); diff --git a/crypto/md5/asm/md5-sparcv9.S b/crypto/md5/asm/md5-sparcv9.S new file mode 100644 index 0000000000..955b3680b5 --- /dev/null +++ b/crypto/md5/asm/md5-sparcv9.S @@ -0,0 +1,1035 @@ +.ident "md5-sparcv9.S, Version 1.0" +.ident "SPARC V9 ISA artwork by Andy Polyakov " +.file "md5-sparcv9.S" + +/* + * ==================================================================== + * Copyright (c) 1999 Andy Polyakov . + * + * Rights for redistribution and usage in source and binary forms are + * granted as long as above copyright notices are retained. Warranty + * of any kind is (of course:-) disclaimed. + * ==================================================================== + */ + +/* + * This is my modest contribution to OpenSSL project (see + * http://www.openssl.org/ for more information about it) and is an + * assembler implementation of MD5 block hash function. I've hand-coded + * this for the sole reason to reach UltraSPARC-specific "load in + * little-endian byte order" instruction. This gives up to 15% + * performance improvement for cases when input message is aligned at + * 32 bits boundary. The module was tested under both 32 *and* 64 bit + * kernels. For updates see http://fy.chalmers.se/~appro/hpe/. 
+ * + * To compile with SC4.x/SC5.x: + * + * cc -xarch=v[9|8plus] -DULTRASPARC -DMD5_BLOCK_DATA_ORDER \ + * -c md5-sparcv9.S + * + * and with gcc: + * + * gcc -mcpu=ultrasparc -DULTRASPARC -DMD5_BLOCK_DATA_ORDER \ + * -c md5-sparcv9.S + * + * or if above fails (it does if you have gas): + * + * gcc -E -DULTRASPARC -DMD5_BLOCK_DATA_ORDER md5_block.sparc.S | \ + * as -xarch=v8plus /dev/fd/0 -o md5-sparcv9.o + */ + +#define A %o0 +#define B %o1 +#define C %o2 +#define D %o3 +#define T1 %o4 +#define T2 %o5 + +#define R0 %l0 +#define R1 %l1 +#define R2 %l2 +#define R3 %l3 +#define R4 %l4 +#define R5 %l5 +#define R6 %l6 +#define R7 %l7 +#define R8 %i3 +#define R9 %i4 +#define R10 %i5 +#define R11 %g1 +#define R12 %g2 +#define R13 %g3 +#define RX %g4 + +#define Aptr %i0+0 +#define Bptr %i0+4 +#define Cptr %i0+8 +#define Dptr %i0+12 + +#define Aval R5 /* those not used at the end of the last round */ +#define Bval R6 +#define Cval R7 +#define Dval R8 + +#if defined(MD5_BLOCK_DATA_ORDER) +# if defined(ULTRASPARC) +# define LOAD lda +# define X(i) [%i1+i*4]%asi +# define md5_block md5_block_asm_data_order_aligned +# define ASI_PRIMARY_LITTLE 0x88 +# else +# error "MD5_BLOCK_DATA_ORDER is supported only on UltraSPARC!" +# endif +#else +# define LOAD ld +# define X(i) [%i1+i*4] +# define md5_block md5_block_asm_host_order +#endif + +.section ".text",#alloc,#execinstr +#if defined(__SUNPRO_C) && defined(__sparcv9) + /* They've said -xarch=v9 at command line */ + .register %g2,#scratch + .register %g3,#scratch +# define FRAME -192 +#else +# define FRAME -96 +#endif + +.align 32 + +.global md5_block +md5_block: + save %sp,FRAME,%sp + + ld [Dptr],D +#ifdef ASI_PRIMARY_LITTLE + mov %asi,%o7 ! How dare I? Well, I just do:-) +#else + nop +#endif + ld [Cptr],C +#ifdef ASI_PRIMARY_LITTLE + mov ASI_PRIMARY_LITTLE,%asi +#else + nop +#endif + ld [Bptr],B + nop + ld [Aptr],A + nop + LOAD X(0),R0 + nop + ba .Lmd5_block_loop + nop + +.align 32 +.Lmd5_block_loop: + +!!!!!!!!Round 0 + + xor C,D,T1 + sethi %hi(0xd76aa478),T2 + and T1,B,T1 + or T2,%lo(0xd76aa478),T2 != + xor T1,D,T1 + add T1,R0,T1 + LOAD X(1),R1 + add T1,T2,T1 != + add A,T1,A + sll A,7,T2 + srl A,32-7,A + or A,T2,A != + xor B,C,T1 + add A,B,A + + sethi %hi(0xe8c7b756),T2 + and T1,A,T1 != + or T2,%lo(0xe8c7b756),T2 + xor T1,C,T1 + LOAD X(2),R2 + add T1,R1,T1 != + add T1,T2,T1 + add D,T1,D + sll D,12,T2 + srl D,32-12,D != + or D,T2,D + xor A,B,T1 + add D,A,D + + sethi %hi(0x242070db),T2 != + and T1,D,T1 + or T2,%lo(0x242070db),T2 + xor T1,B,T1 + add T1,R2,T1 != + LOAD X(3),R3 + add T1,T2,T1 + add C,T1,C + sll C,17,T2 != + srl C,32-17,C + or C,T2,C + xor D,A,T1 + add C,D,C != + + sethi %hi(0xc1bdceee),T2 + and T1,C,T1 + or T2,%lo(0xc1bdceee),T2 + xor T1,A,T1 != + add T1,R3,T1 + LOAD X(4),R4 + add T1,T2,T1 + add B,T1,B != + sll B,22,T2 + srl B,32-22,B + or B,T2,B + xor C,D,T1 != + add B,C,B + + sethi %hi(0xf57c0faf),T2 + and T1,B,T1 + or T2,%lo(0xf57c0faf),T2 != + xor T1,D,T1 + add T1,R4,T1 + LOAD X(5),R5 + add T1,T2,T1 != + add A,T1,A + sll A,7,T2 + srl A,32-7,A + or A,T2,A != + xor B,C,T1 + add A,B,A + + sethi %hi(0x4787c62a),T2 + and T1,A,T1 != + or T2,%lo(0x4787c62a),T2 + xor T1,C,T1 + LOAD X(6),R6 + add T1,R5,T1 != + add T1,T2,T1 + add D,T1,D + sll D,12,T2 + srl D,32-12,D != + or D,T2,D + xor A,B,T1 + add D,A,D + + sethi %hi(0xa8304613),T2 != + and T1,D,T1 + or T2,%lo(0xa8304613),T2 + xor T1,B,T1 + add T1,R6,T1 != + LOAD X(7),R7 + add T1,T2,T1 + add C,T1,C + sll C,17,T2 != + srl C,32-17,C + or C,T2,C + xor D,A,T1 + add C,D,C != + + sethi 
%hi(0xfd469501),T2 + and T1,C,T1 + or T2,%lo(0xfd469501),T2 + xor T1,A,T1 != + add T1,R7,T1 + LOAD X(8),R8 + add T1,T2,T1 + add B,T1,B != + sll B,22,T2 + srl B,32-22,B + or B,T2,B + xor C,D,T1 != + add B,C,B + + sethi %hi(0x698098d8),T2 + and T1,B,T1 + or T2,%lo(0x698098d8),T2 != + xor T1,D,T1 + add T1,R8,T1 + LOAD X(9),R9 + add T1,T2,T1 != + add A,T1,A + sll A,7,T2 + srl A,32-7,A + or A,T2,A != + xor B,C,T1 + add A,B,A + + sethi %hi(0x8b44f7af),T2 + and T1,A,T1 != + or T2,%lo(0x8b44f7af),T2 + xor T1,C,T1 + LOAD X(10),R10 + add T1,R9,T1 != + add T1,T2,T1 + add D,T1,D + sll D,12,T2 + srl D,32-12,D != + or D,T2,D + xor A,B,T1 + add D,A,D + + sethi %hi(0xffff5bb1),T2 != + and T1,D,T1 + or T2,%lo(0xffff5bb1),T2 + xor T1,B,T1 + add T1,R10,T1 != + LOAD X(11),R11 + add T1,T2,T1 + add C,T1,C + sll C,17,T2 != + srl C,32-17,C + or C,T2,C + xor D,A,T1 + add C,D,C != + + sethi %hi(0x895cd7be),T2 + and T1,C,T1 + or T2,%lo(0x895cd7be),T2 + xor T1,A,T1 != + add T1,R11,T1 + LOAD X(12),R12 + add T1,T2,T1 + add B,T1,B != + sll B,22,T2 + srl B,32-22,B + or B,T2,B + xor C,D,T1 != + add B,C,B + + sethi %hi(0x6b901122),T2 + and T1,B,T1 + or T2,%lo(0x6b901122),T2 != + xor T1,D,T1 + add T1,R12,T1 + LOAD X(13),R13 + add T1,T2,T1 != + add A,T1,A + sll A,7,T2 + srl A,32-7,A + or A,T2,A != + xor B,C,T1 + add A,B,A + + sethi %hi(0xfd987193),T2 + and T1,A,T1 != + or T2,%lo(0xfd987193),T2 + xor T1,C,T1 + LOAD X(14),RX + add T1,R13,T1 != + add T1,T2,T1 + add D,T1,D + sll D,12,T2 + srl D,32-12,D != + or D,T2,D + xor A,B,T1 + add D,A,D + + sethi %hi(0xa679438e),T2 != + and T1,D,T1 + or T2,%lo(0xa679438e),T2 + xor T1,B,T1 + add T1,RX,T1 != + LOAD X(15),RX + add T1,T2,T1 + add C,T1,C + sll C,17,T2 != + srl C,32-17,C + or C,T2,C + xor D,A,T1 + add C,D,C != + + sethi %hi(0x49b40821),T2 + and T1,C,T1 + or T2,%lo(0x49b40821),T2 + xor T1,A,T1 != + add T1,RX,T1 + !pre-LOADed X(1),R1 + add T1,T2,T1 + add B,T1,B + sll B,22,T2 != + srl B,32-22,B + or B,T2,B + add B,C,B + +!!!!!!!!Round 1 + + xor B,C,T1 != + sethi %hi(0xf61e2562),T2 + and T1,D,T1 + or T2,%lo(0xf61e2562),T2 + xor T1,C,T1 != + add T1,R1,T1 + !pre-LOADed X(6),R6 + add T1,T2,T1 + add A,T1,A + sll A,5,T2 != + srl A,32-5,A + or A,T2,A + add A,B,A + + xor A,B,T1 != + sethi %hi(0xc040b340),T2 + and T1,C,T1 + or T2,%lo(0xc040b340),T2 + xor T1,B,T1 != + add T1,R6,T1 + !pre-LOADed X(11),R11 + add T1,T2,T1 + add D,T1,D + sll D,9,T2 != + srl D,32-9,D + or D,T2,D + add D,A,D + + xor D,A,T1 != + sethi %hi(0x265e5a51),T2 + and T1,B,T1 + or T2,%lo(0x265e5a51),T2 + xor T1,A,T1 != + add T1,R11,T1 + !pre-LOADed X(0),R0 + add T1,T2,T1 + add C,T1,C + sll C,14,T2 != + srl C,32-14,C + or C,T2,C + add C,D,C + + xor C,D,T1 != + sethi %hi(0xe9b6c7aa),T2 + and T1,A,T1 + or T2,%lo(0xe9b6c7aa),T2 + xor T1,D,T1 != + add T1,R0,T1 + !pre-LOADed X(5),R5 + add T1,T2,T1 + add B,T1,B + sll B,20,T2 != + srl B,32-20,B + or B,T2,B + add B,C,B + + xor B,C,T1 != + sethi %hi(0xd62f105d),T2 + and T1,D,T1 + or T2,%lo(0xd62f105d),T2 + xor T1,C,T1 != + add T1,R5,T1 + !pre-LOADed X(10),R10 + add T1,T2,T1 + add A,T1,A + sll A,5,T2 != + srl A,32-5,A + or A,T2,A + add A,B,A + + xor A,B,T1 != + sethi %hi(0x02441453),T2 + and T1,C,T1 + or T2,%lo(0x02441453),T2 + xor T1,B,T1 != + add T1,R10,T1 + LOAD X(15),RX + add T1,T2,T1 + add D,T1,D != + sll D,9,T2 + srl D,32-9,D + or D,T2,D + add D,A,D != + + xor D,A,T1 + sethi %hi(0xd8a1e681),T2 + and T1,B,T1 + or T2,%lo(0xd8a1e681),T2 != + xor T1,A,T1 + add T1,RX,T1 + !pre-LOADed X(4),R4 + add T1,T2,T1 + add C,T1,C != + sll C,14,T2 + srl C,32-14,C + or C,T2,C + add C,D,C != + + 
xor C,D,T1 + sethi %hi(0xe7d3fbc8),T2 + and T1,A,T1 + or T2,%lo(0xe7d3fbc8),T2 != + xor T1,D,T1 + add T1,R4,T1 + !pre-LOADed X(9),R9 + add T1,T2,T1 + add B,T1,B != + sll B,20,T2 + srl B,32-20,B + or B,T2,B + add B,C,B != + + xor B,C,T1 + sethi %hi(0x21e1cde6),T2 + and T1,D,T1 + or T2,%lo(0x21e1cde6),T2 != + xor T1,C,T1 + add T1,R9,T1 + LOAD X(14),RX + add T1,T2,T1 != + add A,T1,A + sll A,5,T2 + srl A,32-5,A + or A,T2,A != + add A,B,A + + xor A,B,T1 + sethi %hi(0xc33707d6),T2 + and T1,C,T1 != + or T2,%lo(0xc33707d6),T2 + xor T1,B,T1 + add T1,RX,T1 + !pre-LOADed X(3),R3 + add T1,T2,T1 != + add D,T1,D + sll D,9,T2 + srl D,32-9,D + or D,T2,D != + add D,A,D + + xor D,A,T1 + sethi %hi(0xf4d50d87),T2 + and T1,B,T1 != + or T2,%lo(0xf4d50d87),T2 + xor T1,A,T1 + add T1,R3,T1 + !pre-LOADed X(8),R8 + add T1,T2,T1 != + add C,T1,C + sll C,14,T2 + srl C,32-14,C + or C,T2,C != + add C,D,C + + xor C,D,T1 + sethi %hi(0x455a14ed),T2 + and T1,A,T1 != + or T2,%lo(0x455a14ed),T2 + xor T1,D,T1 + add T1,R8,T1 + !pre-LOADed X(13),R13 + add T1,T2,T1 != + add B,T1,B + sll B,20,T2 + srl B,32-20,B + or B,T2,B != + add B,C,B + + xor B,C,T1 + sethi %hi(0xa9e3e905),T2 + and T1,D,T1 != + or T2,%lo(0xa9e3e905),T2 + xor T1,C,T1 + add T1,R13,T1 + !pre-LOADed X(2),R2 + add T1,T2,T1 != + add A,T1,A + sll A,5,T2 + srl A,32-5,A + or A,T2,A != + add A,B,A + + xor A,B,T1 + sethi %hi(0xfcefa3f8),T2 + and T1,C,T1 != + or T2,%lo(0xfcefa3f8),T2 + xor T1,B,T1 + add T1,R2,T1 + !pre-LOADed X(7),R7 + add T1,T2,T1 != + add D,T1,D + sll D,9,T2 + srl D,32-9,D + or D,T2,D != + add D,A,D + + xor D,A,T1 + sethi %hi(0x676f02d9),T2 + and T1,B,T1 != + or T2,%lo(0x676f02d9),T2 + xor T1,A,T1 + add T1,R7,T1 + !pre-LOADed X(12),R12 + add T1,T2,T1 != + add C,T1,C + sll C,14,T2 + srl C,32-14,C + or C,T2,C != + add C,D,C + + xor C,D,T1 + sethi %hi(0x8d2a4c8a),T2 + and T1,A,T1 != + or T2,%lo(0x8d2a4c8a),T2 + xor T1,D,T1 + add T1,R12,T1 + !pre-LOADed X(5),R5 + add T1,T2,T1 != + add B,T1,B + sll B,20,T2 + srl B,32-20,B + or B,T2,B != + add B,C,B + +!!!!!!!!Round 2 + + xor B,C,T1 + sethi %hi(0xfffa3942),T2 + xor T1,D,T1 != + or T2,%lo(0xfffa3942),T2 + add T1,R5,T1 + !pre-LOADed X(8),R8 + add T1,T2,T1 + add A,T1,A != + sll A,4,T2 + srl A,32-4,A + or A,T2,A + add A,B,A != + + xor A,B,T1 + sethi %hi(0x8771f681),T2 + xor T1,C,T1 + or T2,%lo(0x8771f681),T2 != + add T1,R8,T1 + !pre-LOADed X(11),R11 + add T1,T2,T1 + add D,T1,D + sll D,11,T2 != + srl D,32-11,D + or D,T2,D + add D,A,D + + xor D,A,T1 != + sethi %hi(0x6d9d6122),T2 + xor T1,B,T1 + or T2,%lo(0x6d9d6122),T2 + add T1,R11,T1 != + LOAD X(14),RX + add T1,T2,T1 + add C,T1,C + sll C,16,T2 != + srl C,32-16,C + or C,T2,C + add C,D,C + + xor C,D,T1 != + sethi %hi(0xfde5380c),T2 + xor T1,A,T1 + or T2,%lo(0xfde5380c),T2 + add T1,RX,T1 != + !pre-LOADed X(1),R1 + add T1,T2,T1 + add B,T1,B + sll B,23,T2 + srl B,32-23,B != + or B,T2,B + add B,C,B + + xor B,C,T1 + sethi %hi(0xa4beea44),T2 != + xor T1,D,T1 + or T2,%lo(0xa4beea44),T2 + add T1,R1,T1 + !pre-LOADed X(4),R4 + add T1,T2,T1 != + add A,T1,A + sll A,4,T2 + srl A,32-4,A + or A,T2,A != + add A,B,A + + xor A,B,T1 + sethi %hi(0x4bdecfa9),T2 + xor T1,C,T1 != + or T2,%lo(0x4bdecfa9),T2 + add T1,R4,T1 + !pre-LOADed X(7),R7 + add T1,T2,T1 + add D,T1,D != + sll D,11,T2 + srl D,32-11,D + or D,T2,D + add D,A,D != + + xor D,A,T1 + sethi %hi(0xf6bb4b60),T2 + xor T1,B,T1 + or T2,%lo(0xf6bb4b60),T2 != + add T1,R7,T1 + !pre-LOADed X(10),R10 + add T1,T2,T1 + add C,T1,C + sll C,16,T2 != + srl C,32-16,C + or C,T2,C + add C,D,C + + xor C,D,T1 != + sethi %hi(0xbebfbc70),T2 + xor 
T1,A,T1 + or T2,%lo(0xbebfbc70),T2 + add T1,R10,T1 != + !pre-LOADed X(13),R13 + add T1,T2,T1 + add B,T1,B + sll B,23,T2 + srl B,32-23,B != + or B,T2,B + add B,C,B + + xor B,C,T1 + sethi %hi(0x289b7ec6),T2 != + xor T1,D,T1 + or T2,%lo(0x289b7ec6),T2 + add T1,R13,T1 + !pre-LOADed X(0),R0 + add T1,T2,T1 != + add A,T1,A + sll A,4,T2 + srl A,32-4,A + or A,T2,A != + add A,B,A + + xor A,B,T1 + sethi %hi(0xeaa127fa),T2 + xor T1,C,T1 != + or T2,%lo(0xeaa127fa),T2 + add T1,R0,T1 + !pre-LOADed X(3),R3 + add T1,T2,T1 + add D,T1,D != + sll D,11,T2 + srl D,32-11,D + or D,T2,D + add D,A,D != + + xor D,A,T1 + sethi %hi(0xd4ef3085),T2 + xor T1,B,T1 + or T2,%lo(0xd4ef3085),T2 != + add T1,R3,T1 + !pre-LOADed X(6),R6 + add T1,T2,T1 + add C,T1,C + sll C,16,T2 != + srl C,32-16,C + or C,T2,C + add C,D,C + + xor C,D,T1 != + sethi %hi(0x04881d05),T2 + xor T1,A,T1 + or T2,%lo(0x04881d05),T2 + add T1,R6,T1 != + !pre-LOADed X(9),R9 + add T1,T2,T1 + add B,T1,B + sll B,23,T2 + srl B,32-23,B != + or B,T2,B + add B,C,B + + xor B,C,T1 + sethi %hi(0xd9d4d039),T2 != + xor T1,D,T1 + or T2,%lo(0xd9d4d039),T2 + add T1,R9,T1 + !pre-LOADed X(12),R12 + add T1,T2,T1 != + add A,T1,A + sll A,4,T2 + srl A,32-4,A + or A,T2,A != + add A,B,A + + xor A,B,T1 + sethi %hi(0xe6db99e5),T2 + xor T1,C,T1 != + or T2,%lo(0xe6db99e5),T2 + add T1,R12,T1 + LOAD X(15),RX + add T1,T2,T1 != + add D,T1,D + sll D,11,T2 + srl D,32-11,D + or D,T2,D != + add D,A,D + + xor D,A,T1 + sethi %hi(0x1fa27cf8),T2 + xor T1,B,T1 != + or T2,%lo(0x1fa27cf8),T2 + add T1,RX,T1 + !pre-LOADed X(2),R2 + add T1,T2,T1 + add C,T1,C != + sll C,16,T2 + srl C,32-16,C + or C,T2,C + add C,D,C != + + xor C,D,T1 + sethi %hi(0xc4ac5665),T2 + xor T1,A,T1 + or T2,%lo(0xc4ac5665),T2 != + add T1,R2,T1 + !pre-LOADed X(0),R0 + add T1,T2,T1 + add B,T1,B + sll B,23,T2 != + srl B,32-23,B + or B,T2,B + add B,C,B + +!!!!!!!!Round 3 + + orn B,D,T1 != + sethi %hi(0xf4292244),T2 + xor T1,C,T1 + or T2,%lo(0xf4292244),T2 + add T1,R0,T1 != + !pre-LOADed X(7),R7 + add T1,T2,T1 + add A,T1,A + sll A,6,T2 + srl A,32-6,A != + or A,T2,A + add A,B,A + + orn A,C,T1 + sethi %hi(0x432aff97),T2 != + xor T1,B,T1 + or T2,%lo(0x432aff97),T2 + LOAD X(14),RX + add T1,R7,T1 != + add T1,T2,T1 + add D,T1,D + sll D,10,T2 + srl D,32-10,D != + or D,T2,D + add D,A,D + + orn D,B,T1 + sethi %hi(0xab9423a7),T2 != + xor T1,A,T1 + or T2,%lo(0xab9423a7),T2 + add T1,RX,T1 + !pre-LOADed X(5),R5 + add T1,T2,T1 != + add C,T1,C + sll C,15,T2 + srl C,32-15,C + or C,T2,C != + add C,D,C + + orn C,A,T1 + sethi %hi(0xfc93a039),T2 + xor T1,D,T1 != + or T2,%lo(0xfc93a039),T2 + add T1,R5,T1 + !pre-LOADed X(12),R12 + add T1,T2,T1 + add B,T1,B != + sll B,21,T2 + srl B,32-21,B + or B,T2,B + add B,C,B != + + orn B,D,T1 + sethi %hi(0x655b59c3),T2 + xor T1,C,T1 + or T2,%lo(0x655b59c3),T2 != + add T1,R12,T1 + !pre-LOADed X(3),R3 + add T1,T2,T1 + add A,T1,A + sll A,6,T2 != + srl A,32-6,A + or A,T2,A + add A,B,A + + orn A,C,T1 != + sethi %hi(0x8f0ccc92),T2 + xor T1,B,T1 + or T2,%lo(0x8f0ccc92),T2 + add T1,R3,T1 != + !pre-LOADed X(10),R10 + add T1,T2,T1 + add D,T1,D + sll D,10,T2 + srl D,32-10,D != + or D,T2,D + add D,A,D + + orn D,B,T1 + sethi %hi(0xffeff47d),T2 != + xor T1,A,T1 + or T2,%lo(0xffeff47d),T2 + add T1,R10,T1 + !pre-LOADed X(1),R1 + add T1,T2,T1 != + add C,T1,C + sll C,15,T2 + srl C,32-15,C + or C,T2,C != + add C,D,C + + orn C,A,T1 + sethi %hi(0x85845dd1),T2 + xor T1,D,T1 != + or T2,%lo(0x85845dd1),T2 + add T1,R1,T1 + !pre-LOADed X(8),R8 + add T1,T2,T1 + add B,T1,B != + sll B,21,T2 + srl B,32-21,B + or B,T2,B + add B,C,B != + + orn B,D,T1 
+ sethi %hi(0x6fa87e4f),T2 + xor T1,C,T1 + or T2,%lo(0x6fa87e4f),T2 != + add T1,R8,T1 + LOAD X(15),RX + add T1,T2,T1 + add A,T1,A != + sll A,6,T2 + srl A,32-6,A + or A,T2,A + add A,B,A != + + orn A,C,T1 + sethi %hi(0xfe2ce6e0),T2 + xor T1,B,T1 + or T2,%lo(0xfe2ce6e0),T2 != + add T1,RX,T1 + !pre-LOADed X(6),R6 + add T1,T2,T1 + add D,T1,D + sll D,10,T2 != + srl D,32-10,D + or D,T2,D + add D,A,D + + orn D,B,T1 != + sethi %hi(0xa3014314),T2 + xor T1,A,T1 + or T2,%lo(0xa3014314),T2 + add T1,R6,T1 != + !pre-LOADed X(13),R13 + add T1,T2,T1 + add C,T1,C + sll C,15,T2 + srl C,32-15,C != + or C,T2,C + add C,D,C + + orn C,A,T1 + sethi %hi(0x4e0811a1),T2 != + xor T1,D,T1 + or T2,%lo(0x4e0811a1),T2 + !pre-LOADed X(4),R4 + ld [Aptr],Aval + add T1,R13,T1 != + add T1,T2,T1 + add B,T1,B + sll B,21,T2 + srl B,32-21,B != + or B,T2,B + add B,C,B + + orn B,D,T1 + sethi %hi(0xf7537e82),T2 != + xor T1,C,T1 + or T2,%lo(0xf7537e82),T2 + !pre-LOADed X(11),R11 + ld [Dptr],Dval + add T1,R4,T1 != + add T1,T2,T1 + add A,T1,A + sll A,6,T2 + srl A,32-6,A != + or A,T2,A + add A,B,A + + orn A,C,T1 + sethi %hi(0xbd3af235),T2 != + xor T1,B,T1 + or T2,%lo(0xbd3af235),T2 + !pre-LOADed X(2),R2 + ld [Cptr],Cval + add T1,R11,T1 != + add T1,T2,T1 + add D,T1,D + sll D,10,T2 + srl D,32-10,D != + or D,T2,D + add D,A,D + + orn D,B,T1 + sethi %hi(0x2ad7d2bb),T2 != + xor T1,A,T1 + or T2,%lo(0x2ad7d2bb),T2 + !pre-LOADed X(9),R9 + ld [Bptr],Bval + add T1,R2,T1 != + add Aval,A,Aval + add T1,T2,T1 + st Aval,[Aptr] + add C,T1,C != + sll C,15,T2 + add Dval,D,Dval + srl C,32-15,C + or C,T2,C != + st Dval,[Dptr] + add C,D,C + + orn C,A,T1 + sethi %hi(0xeb86d391),T2 != + xor T1,D,T1 + or T2,%lo(0xeb86d391),T2 + add T1,R9,T1 + !pre-LOADed X(0),R0 + mov Aval,A != + add T1,T2,T1 + mov Dval,D + add B,T1,B + sll B,21,T2 != + add Cval,C,Cval + srl B,32-21,B + st Cval,[Cptr] + or B,T2,B != + add B,C,B + + deccc %i2 + mov Cval,C + add B,Bval,B != + inc 64,%i1 + nop + st B,[Bptr] + nop != + +#ifdef ULTRASPARC + bg,a,pt %icc,.Lmd5_block_loop +#else + bg,a .Lmd5_block_loop +#endif + LOAD X(0),R0 + +#ifdef ASI_PRIMARY_LITTLE + mov %o7,%asi +#endif + ret + restore %g0,0,%o0 + +.type md5_block,#function +.size md5_block,(.-md5_block) diff --git a/crypto/md5/md5.h b/crypto/md5/md5.h index 6e97fe1e4f..148fb963d4 100644 --- a/crypto/md5/md5.h +++ b/crypto/md5/md5.h @@ -67,23 +67,43 @@ extern "C" { #error MD5 is disabled. #endif +/* + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + * ! MD5_LONG has to be at least 32 bits wide. If it's wider, then ! + * ! MD5_LONG_LOG2 has to be defined along. ! + * !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! + */ + +#if defined(WIN16) || defined(__LP32__) +#define MD5_LONG unsigned long +#elif defined(_CRAY) || defined(__ILP64__) +#define MD5_LONG unsigned long +#define MD5_LONG_LOG2 3 +/* + * _CRAY note. I could declare short, but I have no idea what impact + * does it have on performance on none-T3E machines. I could declare + * int, but at least on C90 sizeof(int) can be chosen at compile time. + * So I've chosen long... 
+ * + */ +#else +#define MD5_LONG unsigned int +#endif + #define MD5_CBLOCK 64 -#define MD5_LBLOCK 16 -#define MD5_BLOCK 16 -#define MD5_LAST_BLOCK 56 -#define MD5_LENGTH_BLOCK 8 +#define MD5_LBLOCK (MD5_CBLOCK/4) #define MD5_DIGEST_LENGTH 16 typedef struct MD5state_st { - unsigned long A,B,C,D; - unsigned long Nl,Nh; - unsigned long data[MD5_LBLOCK]; + MD5_LONG A,B,C,D; + MD5_LONG Nl,Nh; + MD5_LONG data[MD5_LBLOCK]; int num; } MD5_CTX; void MD5_Init(MD5_CTX *c); -void MD5_Update(MD5_CTX *c, const void *data, unsigned long len); +void MD5_Update(MD5_CTX *c, const unsigned char *data, unsigned long len); void MD5_Final(unsigned char *md, MD5_CTX *c); unsigned char *MD5(unsigned char *d, unsigned long n, unsigned char *md); void MD5_Transform(MD5_CTX *c, unsigned char *b); diff --git a/crypto/md5/md5_dgst.c b/crypto/md5/md5_dgst.c index 2671b00172..830e04d02a 100644 --- a/crypto/md5/md5_dgst.c +++ b/crypto/md5/md5_dgst.c @@ -70,12 +70,6 @@ char *MD5_version="MD5" OPENSSL_VERSION_PTEXT; #define INIT_DATA_C (unsigned long)0x98badcfeL #define INIT_DATA_D (unsigned long)0x10325476L -# ifdef MD5_ASM - void md5_block_x86(MD5_CTX *c, unsigned long *p,int num); -# define md5_block md5_block_x86 -# else - static void md5_block(MD5_CTX *c, unsigned long *p,int num); -# endif void MD5_Init(MD5_CTX *c) { c->A=INIT_DATA_A; @@ -87,183 +81,31 @@ void MD5_Init(MD5_CTX *c) c->num=0; } -void MD5_Update(MD5_CTX *c, const void *_data, unsigned long len) +#ifndef md5_block_host_order +void md5_block_host_order (MD5_CTX *c, const MD5_LONG *X, int num) { - register const unsigned char *data=_data; - register ULONG *p; - int sw,sc; - ULONG l; - - if (len == 0) return; - - l=(c->Nl+(len<<3))&0xffffffffL; - /* 95-05-24 eay Fixed a bug with the overflow handling, thanks to - * Wei Dai for pointing it out. */ - if (l < c->Nl) /* overflow */ - c->Nh++; - c->Nh+=(len>>29); - c->Nl=l; - - if (c->num != 0) - { - p=c->data; - sw=c->num>>2; - sc=c->num&0x03; - - if ((c->num+len) >= MD5_CBLOCK) - { - l= p[sw]; - p_c2l(data,l,sc); - p[sw++]=l; - for (; swnum); - - md5_block(c,p,64); - c->num=0; - /* drop through and do the rest */ - } - else - { - int ew,ec; - - c->num+=(int)len; - if ((sc+len) < 4) /* ugly, add char's to a word */ - { - l= p[sw]; - p_c2l_p(data,l,sc,len); - p[sw]=l; - } - else - { - ew=(c->num>>2); - ec=(c->num&0x03); - l= p[sw]; - p_c2l(data,l,sc); - p[sw++]=l; - for (; sw < ew; sw++) - { c2l(data,l); p[sw]=l; } - if (ec) - { - c2l_p(data,l,ec); - p[sw]=l; - } - } - return; - } - } - /* we now can process the input data in blocks of MD5_CBLOCK - * chars and save the leftovers to c->data. 
*/ -#ifdef L_ENDIAN - if ((((unsigned long)data)%sizeof(ULONG)) == 0) - { - sw=(int)len/MD5_CBLOCK; - if (sw > 0) - { - sw*=MD5_CBLOCK; - md5_block(c,(ULONG *)data,sw); - data+=sw; - len-=sw; - } - } -#endif - p=c->data; - while (len >= MD5_CBLOCK) - { -#if defined(L_ENDIAN) || defined(B_ENDIAN) - if (p != (unsigned long *)data) - memcpy(p,data,MD5_CBLOCK); - data+=MD5_CBLOCK; -#ifdef B_ENDIAN - for (sw=(MD5_LBLOCK/4); sw; sw--) - { - Endian_Reverse32(p[0]); - Endian_Reverse32(p[1]); - Endian_Reverse32(p[2]); - Endian_Reverse32(p[3]); - p+=4; - } -#endif -#else - for (sw=(MD5_LBLOCK/4); sw; sw--) - { - c2l(data,l); *(p++)=l; - c2l(data,l); *(p++)=l; - c2l(data,l); *(p++)=l; - c2l(data,l); *(p++)=l; - } -#endif - p=c->data; - md5_block(c,p,64); - len-=MD5_CBLOCK; - } - sc=(int)len; - c->num=sc; - if (sc) - { - sw=sc>>2; /* words to copy */ -#ifdef L_ENDIAN - p[sw]=0; - memcpy(p,data,sc); -#else - sc&=0x03; - for ( ; sw; sw--) - { c2l(data,l); *(p++)=l; } - c2l_p(data,l,sc); - *p=l; -#endif - } - } - -void MD5_Transform(MD5_CTX *c, unsigned char *b) - { - ULONG p[16]; -#if !defined(L_ENDIAN) - ULONG *q; - int i; -#endif - -#if defined(B_ENDIAN) || defined(L_ENDIAN) - memcpy(p,b,64); -#ifdef B_ENDIAN - q=p; - for (i=(MD5_LBLOCK/4); i; i--) - { - Endian_Reverse32(q[0]); - Endian_Reverse32(q[1]); - Endian_Reverse32(q[2]); - Endian_Reverse32(q[3]); - q+=4; - } -#endif -#else - q=p; - for (i=(MD5_LBLOCK/4); i; i--) - { - ULONG l; - c2l(b,l); *(q++)=l; - c2l(b,l); *(q++)=l; - c2l(b,l); *(q++)=l; - c2l(b,l); *(q++)=l; - } -#endif - md5_block(c,p,64); - } - -#ifndef MD5_ASM - -static void md5_block(MD5_CTX *c, register ULONG *X, int num) - { - register ULONG A,B,C,D; + register unsigned long A,B,C,D; + /* + * In case you wonder why A-D are declared as long and not + * as MD5_LONG. Doing so results in slight performance + * boost on LP64 architectures. The catch is we don't + * really care if 32 MSBs of a 64-bit register get polluted + * with eventual overflows as we *save* only 32 LSBs in + * *either* case. Now declaring 'em long excuses the compiler + * from keeping 32 MSBs zeroed resulting in 13% performance + * improvement under SPARC Solaris7/64 and 5% under AlphaLinux. + * Well, to be honest it should say that this *prevents* + * performance degradation. + * + * + */ A=c->A; B=c->B; C=c->C; D=c->D; - for (;;) + + for (;num--;X+=HASH_LBLOCK) { /* Round 0 */ R0(A,B,C,D,X[ 0], 7,0xd76aa478L); @@ -334,74 +176,127 @@ static void md5_block(MD5_CTX *c, register ULONG *X, int num) R3(C,D,A,B,X[ 2],15,0x2ad7d2bbL); R3(B,C,D,A,X[ 9],21,0xeb86d391L); - A+=c->A&0xffffffffL; - B+=c->B&0xffffffffL; - c->A=A; - c->B=B; - C+=c->C&0xffffffffL; - D+=c->D&0xffffffffL; - c->C=C; - c->D=D; - X+=16; - num-=64; - if (num <= 0) break; + A = c->A += A; + B = c->B += B; + C = c->C += C; + D = c->D += D; } } #endif -void MD5_Final(unsigned char *md, MD5_CTX *c) +#ifndef md5_block_data_order +void md5_block_data_order (MD5_CTX *c, const unsigned char *data, int num) { - register int i,j; - register ULONG l; - register ULONG *p; - static unsigned char end[4]={0x80,0x00,0x00,0x00}; - unsigned char *cp=end; + register unsigned long A,B,C,D,l; + /* + * In case you wonder why A-D are declared as long and not + * as MD5_LONG. Doing so results in slight performance + * boost on LP64 architectures. The catch is we don't + * really care if 32 MSBs of a 64-bit register get polluted + * with eventual overflows as we *save* only 32 LSBs in + * *either* case. 
Now declaring 'em long excuses the compiler + * from keeping 32 MSBs zeroed resulting in 13% performance + * improvement under SPARC Solaris7/64 and 5% under AlphaLinux. + * Well, to be honest it should say that this *prevents* + * performance degradation. + * + * + */ + MD5_LONG X[MD5_LBLOCK]; + /* + * In case you wonder why don't I use c->data for this. + * RISCs usually have a handful of registers and if X is + * declared as automatic array good optimizing compiler + * shall accomodate at least part of it in register bank + * instead of memory. + * + * + */ - /* c->num should definitly have room for at least one more byte. */ - p=c->data; - j=c->num; - i=j>>2; + A=c->A; + B=c->B; + C=c->C; + D=c->D; - /* purify often complains about the following line as an - * Uninitialized Memory Read. While this can be true, the - * following p_c2l macro will reset l when that case is true. - * This is because j&0x03 contains the number of 'valid' bytes - * already in p[i]. If and only if j&0x03 == 0, the UMR will - * occur but this is also the only time p_c2l will do - * l= *(cp++) instead of l|= *(cp++) - * Many thanks to Alex Tang for pickup this - * 'potential bug' */ -#ifdef PURIFY - if ((j&0x03) == 0) p[i]=0; -#endif - l=p[i]; - p_c2l(cp,l,j&0x03); - p[i]=l; - i++; - /* i is the next 'undefined word' */ - if (c->num >= MD5_LAST_BLOCK) + for (;num--;) { - for (; iNl; - p[MD5_LBLOCK-1]=c->Nh; - md5_block(c,p,64); - cp=md; - l=c->A; l2c(l,cp); - l=c->B; l2c(l,cp); - l=c->C; l2c(l,cp); - l=c->D; l2c(l,cp); + HOST_c2l(data,l); X[ 0]=l; HOST_c2l(data,l); X[ 1]=l; + /* Round 0 */ + R0(A,B,C,D,X[ 0], 7,0xd76aa478L); HOST_c2l(data,l); X[ 2]=l; + R0(D,A,B,C,X[ 1],12,0xe8c7b756L); HOST_c2l(data,l); X[ 3]=l; + R0(C,D,A,B,X[ 2],17,0x242070dbL); HOST_c2l(data,l); X[ 4]=l; + R0(B,C,D,A,X[ 3],22,0xc1bdceeeL); HOST_c2l(data,l); X[ 5]=l; + R0(A,B,C,D,X[ 4], 7,0xf57c0fafL); HOST_c2l(data,l); X[ 6]=l; + R0(D,A,B,C,X[ 5],12,0x4787c62aL); HOST_c2l(data,l); X[ 7]=l; + R0(C,D,A,B,X[ 6],17,0xa8304613L); HOST_c2l(data,l); X[ 8]=l; + R0(B,C,D,A,X[ 7],22,0xfd469501L); HOST_c2l(data,l); X[ 9]=l; + R0(A,B,C,D,X[ 8], 7,0x698098d8L); HOST_c2l(data,l); X[10]=l; + R0(D,A,B,C,X[ 9],12,0x8b44f7afL); HOST_c2l(data,l); X[11]=l; + R0(C,D,A,B,X[10],17,0xffff5bb1L); HOST_c2l(data,l); X[12]=l; + R0(B,C,D,A,X[11],22,0x895cd7beL); HOST_c2l(data,l); X[13]=l; + R0(A,B,C,D,X[12], 7,0x6b901122L); HOST_c2l(data,l); X[14]=l; + R0(D,A,B,C,X[13],12,0xfd987193L); HOST_c2l(data,l); X[15]=l; + R0(C,D,A,B,X[14],17,0xa679438eL); + R0(B,C,D,A,X[15],22,0x49b40821L); + /* Round 1 */ + R1(A,B,C,D,X[ 1], 5,0xf61e2562L); + R1(D,A,B,C,X[ 6], 9,0xc040b340L); + R1(C,D,A,B,X[11],14,0x265e5a51L); + R1(B,C,D,A,X[ 0],20,0xe9b6c7aaL); + R1(A,B,C,D,X[ 5], 5,0xd62f105dL); + R1(D,A,B,C,X[10], 9,0x02441453L); + R1(C,D,A,B,X[15],14,0xd8a1e681L); + R1(B,C,D,A,X[ 4],20,0xe7d3fbc8L); + R1(A,B,C,D,X[ 9], 5,0x21e1cde6L); + R1(D,A,B,C,X[14], 9,0xc33707d6L); + R1(C,D,A,B,X[ 3],14,0xf4d50d87L); + R1(B,C,D,A,X[ 8],20,0x455a14edL); + R1(A,B,C,D,X[13], 5,0xa9e3e905L); + R1(D,A,B,C,X[ 2], 9,0xfcefa3f8L); + R1(C,D,A,B,X[ 7],14,0x676f02d9L); + R1(B,C,D,A,X[12],20,0x8d2a4c8aL); + /* Round 2 */ + R2(A,B,C,D,X[ 5], 4,0xfffa3942L); + R2(D,A,B,C,X[ 8],11,0x8771f681L); + R2(C,D,A,B,X[11],16,0x6d9d6122L); + R2(B,C,D,A,X[14],23,0xfde5380cL); + R2(A,B,C,D,X[ 1], 4,0xa4beea44L); + R2(D,A,B,C,X[ 4],11,0x4bdecfa9L); + R2(C,D,A,B,X[ 7],16,0xf6bb4b60L); + R2(B,C,D,A,X[10],23,0xbebfbc70L); + R2(A,B,C,D,X[13], 4,0x289b7ec6L); + R2(D,A,B,C,X[ 0],11,0xeaa127faL); + R2(C,D,A,B,X[ 3],16,0xd4ef3085L); + 
R2(B,C,D,A,X[ 6],23,0x04881d05L); + R2(A,B,C,D,X[ 9], 4,0xd9d4d039L); + R2(D,A,B,C,X[12],11,0xe6db99e5L); + R2(C,D,A,B,X[15],16,0x1fa27cf8L); + R2(B,C,D,A,X[ 2],23,0xc4ac5665L); + /* Round 3 */ + R3(A,B,C,D,X[ 0], 6,0xf4292244L); + R3(D,A,B,C,X[ 7],10,0x432aff97L); + R3(C,D,A,B,X[14],15,0xab9423a7L); + R3(B,C,D,A,X[ 5],21,0xfc93a039L); + R3(A,B,C,D,X[12], 6,0x655b59c3L); + R3(D,A,B,C,X[ 3],10,0x8f0ccc92L); + R3(C,D,A,B,X[10],15,0xffeff47dL); + R3(B,C,D,A,X[ 1],21,0x85845dd1L); + R3(A,B,C,D,X[ 8], 6,0x6fa87e4fL); + R3(D,A,B,C,X[15],10,0xfe2ce6e0L); + R3(C,D,A,B,X[ 6],15,0xa3014314L); + R3(B,C,D,A,X[13],21,0x4e0811a1L); + R3(A,B,C,D,X[ 4], 6,0xf7537e82L); + R3(D,A,B,C,X[11],10,0xbd3af235L); + R3(C,D,A,B,X[ 2],15,0x2ad7d2bbL); + R3(B,C,D,A,X[ 9],21,0xeb86d391L); - /* clear stuff, md5_block may be leaving some stuff on the stack - * but I'm not worried :-) */ - c->num=0; -/* memset((char *)&c,0,sizeof(c));*/ + A = c->A += A; + B = c->B += B; + C = c->C += C; + D = c->D += D; + } } +#endif #ifdef undef int printit(unsigned long *l) diff --git a/crypto/md5/md5_locl.h b/crypto/md5/md5_locl.h index fe7397a495..56708ba8ea 100644 --- a/crypto/md5/md5_locl.h +++ b/crypto/md5/md5_locl.h @@ -56,98 +56,79 @@ * [including the GNU Public Licence.] */ -/* On sparc, this actually slows things down :-( */ -#if defined(sun) -#undef B_ENDIAN -#endif - #include #include #include -#define ULONG unsigned long -#define UCHAR unsigned char -#define UINT unsigned int - -#undef c2l -#define c2l(c,l) (l = ((unsigned long)(*((c)++))) , \ - l|=(((unsigned long)(*((c)++)))<< 8), \ - l|=(((unsigned long)(*((c)++)))<<16), \ - l|=(((unsigned long)(*((c)++)))<<24)) +#ifndef MD5_LONG_LOG2 +#define MD5_LONG_LOG2 2 /* default to 32 bits */ +#endif -#undef p_c2l -#define p_c2l(c,l,n) { \ - switch (n) { \ - case 0: l =((unsigned long)(*((c)++))); \ - case 1: l|=((unsigned long)(*((c)++)))<< 8; \ - case 2: l|=((unsigned long)(*((c)++)))<<16; \ - case 3: l|=((unsigned long)(*((c)++)))<<24; \ - } \ - } +#ifdef MD5_ASM +# if defined(__i386) || defined(WIN32) +# define md5_block_host_order md5_block_asm_host_order +# elif defined(__sparc) && defined(ULTRASPARC) + void md5_block_asm_data_order_aligned (MD5_CTX *c, const MD5_LONG *p,int num); +# define HASH_BLOCK_DATA_ORDER_ALIGNED md5_block_asm_data_order_aligned +# endif +#endif -/* NOTE the pointer is not incremented at the end of this */ -#undef c2l_p -#define c2l_p(c,l,n) { \ - l=0; \ - (c)+=n; \ - switch (n) { \ - case 3: l =((unsigned long)(*(--(c))))<<16; \ - case 2: l|=((unsigned long)(*(--(c))))<< 8; \ - case 1: l|=((unsigned long)(*(--(c)))) ; \ - } \ - } +void md5_block_host_order (MD5_CTX *c, const MD5_LONG *p,int num); +void md5_block_data_order (MD5_CTX *c, const unsigned char *p,int num); -#undef p_c2l_p -#define p_c2l_p(c,l,sc,len) { \ - switch (sc) \ - { \ - case 0: l =((unsigned long)(*((c)++))); \ - if (--len == 0) break; \ - case 1: l|=((unsigned long)(*((c)++)))<< 8; \ - if (--len == 0) break; \ - case 2: l|=((unsigned long)(*((c)++)))<<16; \ - } \ - } +#if defined(__i386) +/* + * *_block_host_order is expected to handle aligned data while + * *_block_data_order - unaligned. As algorithm and host (x86) + * are in this case of the same "endianess" these two are + * otherwise indistinguishable. But normally you don't want to + * call the same function because unaligned access in places + * where alignment is expected is usually a "Bad Thing". Indeed, + * on RISCs you get punished with BUS ERROR signal or *severe* + * performance degradation. 
Intel CPUs are in turn perfectly + * capable of loading unaligned data without such drastic side + * effect. Yes, they say it's slower than aligned load, but no + * exception is generated and therefore performance degradation + * is *incomparable* with RISCs. What we should weight here is + * costs of unaligned access against costs of aligning data. + * According to my measurements allowing unaligned access results + * in ~9% performance improvement on Pentium II operating at + * 266MHz. I won't be surprised if the difference will be higher + * on faster systems:-) + * + * + */ +#define md5_block_data_order md5_block_host_order +#endif -#undef l2c -#define l2c(l,c) (*((c)++)=(unsigned char)(((l) )&0xff), \ - *((c)++)=(unsigned char)(((l)>> 8)&0xff), \ - *((c)++)=(unsigned char)(((l)>>16)&0xff), \ - *((c)++)=(unsigned char)(((l)>>24)&0xff)) +#define DATA_ORDER_IS_LITTLE_ENDIAN + +#define HASH_LONG MD5_LONG +#define HASH_LONG_LOG2 MD5_LONG_LOG2 +#define HASH_CTX MD5_CTX +#define HASH_CBLOCK MD5_CBLOCK +#define HASH_LBLOCK MD5_LBLOCK +#define HASH_UPDATE MD5_Update +#define HASH_TRANSFORM MD5_Transform +#define HASH_FINAL MD5_Final +#define HASH_BLOCK_HOST_ORDER md5_block_host_order +#if defined(B_ENDIAN) || defined(md5_block_data_order) +#define HASH_BLOCK_DATA_ORDER md5_block_data_order +/* + * Little-endians (Intel and Alpha) feel better without this. + * It looks like memcpy does better job than generic + * md5_block_data_order on copying-n-aligning input data. + * But franlky speaking I didn't expect such result on Alpha. + * On the other hand I've got this with egcs-1.0.2 and if + * program is compiled with another (better?) compiler it + * might turn out other way around. + * + * + */ +#endif -/* NOTE - c is not incremented as per l2c */ -#undef l2cn -#define l2cn(l1,l2,c,n) { \ - c+=n; \ - switch (n) { \ - case 8: *(--(c))=(unsigned char)(((l2)>>24)&0xff); \ - case 7: *(--(c))=(unsigned char)(((l2)>>16)&0xff); \ - case 6: *(--(c))=(unsigned char)(((l2)>> 8)&0xff); \ - case 5: *(--(c))=(unsigned char)(((l2) )&0xff); \ - case 4: *(--(c))=(unsigned char)(((l1)>>24)&0xff); \ - case 3: *(--(c))=(unsigned char)(((l1)>>16)&0xff); \ - case 2: *(--(c))=(unsigned char)(((l1)>> 8)&0xff); \ - case 1: *(--(c))=(unsigned char)(((l1) )&0xff); \ - } \ - } +#include "../md32_common.h" -/* A nice byte order reversal from Wei Dai */ -#if defined(WIN32) -/* 5 instructions with rotate instruction, else 9 */ -#define Endian_Reverse32(a) \ - { \ - unsigned long l=(a); \ - (a)=((ROTATE(l,8)&0x00FF00FF)|(ROTATE(l,24)&0xFF00FF00)); \ - } -#else -/* 6 instructions with rotate instruction, else 8 */ -#define Endian_Reverse32(a) \ - { \ - unsigned long l=(a); \ - l=(((l&0xFF00FF00)>>8L)|((l&0x00FF00FF)<<8L)); \ - (a)=ROTATE(l,16L); \ - } -#endif /* #define F(x,y,z) (((x) & (y)) | ((~(x)) & (z))) #define G(x,y,z) (((x) & (z)) | ((y) & (~(z)))) @@ -162,14 +143,6 @@ #define H(b,c,d) ((b) ^ (c) ^ (d)) #define I(b,c,d) (((~(d)) | (b)) ^ (c)) -#undef ROTATE -#if defined(WIN32) -#define ROTATE(a,n) _lrotl(a,n) -#else -#define ROTATE(a,n) (((a)<<(n))|(((a)&0xffffffff)>>(32-(n)))) -#endif - - #define R0(a,b,c,d,k,s,t) { \ a+=((k)+(t)+F((b),(c),(d))); \ a=ROTATE(a,s); \ diff --git a/crypto/md5/md5_one.c b/crypto/md5/md5_one.c index c761401160..c98721f4d8 100644 --- a/crypto/md5/md5_one.c +++ b/crypto/md5/md5_one.c @@ -57,7 +57,8 @@ */ #include -#include "md5_locl.h" +#include +#include unsigned char *MD5(unsigned char *d, unsigned long n, unsigned char *md) { -- 2.25.1
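
For reference, the md5.h hunk above changes MD5_Update() to take const unsigned char * rather than const void *, retypes the context fields as MD5_LONG, and drops the old MD5_BLOCK/MD5_LAST_BLOCK constants. A minimal caller against the modified interface might look like the sketch below; it is not part of the patch, and the <openssl/md5.h> include path and the -lcrypto link step are assumed from the usual OpenSSL layout rather than shown in this diff.

/* Not part of the patch: a minimal caller sketch against the md5.h
 * interface as modified above. Assumes the usual OpenSSL include and
 * link setup (-lcrypto); shown only to illustrate the updated
 * MD5_Update() prototype (const unsigned char *data, unsigned long len).
 */
#include <stdio.h>
#include <string.h>
#include <openssl/md5.h>

int main(void)
	{
	MD5_CTX ctx;
	unsigned char md[MD5_DIGEST_LENGTH];
	static const unsigned char msg[]="message digest";
	int i;

	MD5_Init(&ctx);
	/* data is now passed as const unsigned char *, not const void * */
	MD5_Update(&ctx,msg,(unsigned long)strlen((const char *)msg));
	MD5_Final(md,&ctx);

	for (i=0; i<MD5_DIGEST_LENGTH; i++)
		printf("%02x",md[i]);
	printf("\n");
	return(0);
	}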