From b2dba9bf1f8f73376b9c1f0904a86996c728b236 Mon Sep 17 00:00:00 2001
From: Andy Polyakov
Date: Mon, 14 May 2007 21:35:25 +0000
Subject: [PATCH] Profiling revealed that OPENSSL_cleanse consumes *more* CPU
 time than sha1_block_data_order when hashing short messages. Move
 OPENSSL_cleanse to "cpuid" assembler module and gain 2x.

---
NOTE(review): this copy was rebuilt from a whitespace-collapsed paste of the
patch. Line breaks follow the +/- markers and the hunk line counts. Tab versus
space indentation, the position of blank context lines, and anything that was
inside angle brackets (the author address, probably the file handle in the
"while ()" context line of Configure) could not be recovered; apply with
whitespace tolerance and compare against commit b2dba9bf before relying on it.

Unlike that commit, the four OPENSSL_cleanse routines below return at once
when len is 0. As committed, each of them stored a byte before testing the
length, and the x86, x86_64 and SPARC versions then decremented len past zero
and kept storing. Hunk counts and the diffstat reflect the added lines; the
"index" hashes are still those of the original commit.

 Configure             |  3 ++-
 crypto/Makefile       |  2 +-
 crypto/ia64cpuid.S    | 38 +++++++++++++++++++++++++++++++
 crypto/mem.c          | 10 +++++++--
 crypto/sparccpuid.S   | 52 +++++++++++++++++++++++++++++++++++++++++++
 crypto/x86_64cpuid.pl | 35 +++++++++++++++++++++++++++++
 crypto/x86cpuid.pl    | 34 ++++++++++++++++++++++++++++
 7 files changed, 170 insertions(+), 4 deletions(-)

diff --git a/Configure b/Configure
index 9b210953a5..b072bfbc7d 100755
--- a/Configure
+++ b/Configure
@@ -1209,6 +1209,7 @@ $cflags.=" -DOPENSSL_IA32_SSE2" if (!$no_sse2 && $bn_obj =~ /bn86/);
 
 $cflags.=" -DOPENSSL_BN_ASM_MONT" if ($bn_obj =~ /\-mont|mo86\-/);
 
+$cpuid_obj="mem_clr.o"	unless ($cpuid_obj =~ /\.o$/);
 $des_obj=$des_enc	unless ($des_obj =~ /\.o$/);
 $bf_obj=$bf_enc	unless ($bf_obj =~ /\.o$/);
 $cast_obj=$cast_enc	unless ($cast_obj =~ /\.o$/);
@@ -1481,7 +1482,7 @@ print OUT "#ifdef OPENSSL_ALGORITHM_DEFINES\n";
 print OUT $openssl_algorithm_defines_trans;
 print OUT "#endif\n\n";
 
-print OUT "#define OPENSSL_CPUID_OBJ\n\n" if ($cpuid_obj);
+print OUT "#define OPENSSL_CPUID_OBJ\n\n" if ($cpuid_obj ne "mem_clr.o");
 
 while ()
 	{
diff --git a/crypto/Makefile b/crypto/Makefile
index efe6a79d87..1b8c7c2591 100644
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -34,7 +34,7 @@ GENERAL=Makefile README crypto-lib.com install.com
 LIB= $(TOP)/libcrypto.a
 SHARED_LIB= libcrypto$(SHLIB_EXT)
 LIBSRC= cryptlib.c mem.c mem_clr.c mem_dbg.c cversion.c ex_data.c cpt_err.c ebcdic.c uid.c o_time.c o_str.c o_dir.c
-LIBOBJ= cryptlib.o mem.o mem_clr.o mem_dbg.o cversion.o ex_data.o cpt_err.o ebcdic.o uid.o o_time.o o_str.o o_dir.o $(CPUID_OBJ)
+LIBOBJ= cryptlib.o mem.o mem_dbg.o cversion.o ex_data.o cpt_err.o ebcdic.o uid.o o_time.o o_str.o o_dir.o $(CPUID_OBJ)
 
 SRC= $(LIBSRC)
 
diff --git a/crypto/ia64cpuid.S b/crypto/ia64cpuid.S
index 5836565abb..818e2d1e1d 100644
--- a/crypto/ia64cpuid.S
+++ b/crypto/ia64cpuid.S
@@ -1,11 +1,13 @@
 // Works on all IA-64 platforms: Linux, HP-UX, Win64i...
 // On Win64i compile with ias.exe.
 .text
+
 .global	OPENSSL_cpuid_setup#
 .proc	OPENSSL_cpuid_setup#
 OPENSSL_cpuid_setup:
 { .mib;	br.ret.sptk.many	b0	};;
 .endp	OPENSSL_cpuid_setup#
+
 .global	OPENSSL_rdtsc#
 .proc	OPENSSL_rdtsc#
 OPENSSL_rdtsc:
@@ -124,3 +126,39 @@ OPENSSL_wipe_cpu:
 	mov		ar.lc=r3
 	br.ret.sptk	b0		};;
 .endp	OPENSSL_wipe_cpu#
+
+.global	OPENSSL_cleanse#
+.proc	OPENSSL_cleanse#
+OPENSSL_cleanse:
+{ .mib;	cmp.eq		p6,p0=0,r33	// len==0: nothing to wipe
+(p6)	br.ret.spnt.many	b0	};;
+{ .mib;	and		r2=7,r32
+	cmp.leu		p6,p0=15,r33	// len>=15
+(p6)	br.cond.dptk	.Lot		};;
+
+.Little:
+{ .mib;	st1		[r32]=r0,1
+	cmp.ltu		p6,p7=1,r33	}	// len>1
+{ .mbb;	add		r33=-1,r33	// len--
+(p6)	br.cond.dptk	.Little
+(p7)	br.ret.sptk.many	b0	};;
+
+.Lot:
+{ .mib;	cmp.eq		p6,p0=0,r2
+(p6)	br.cond.dptk	.Laligned	};;
+{ .mmi;	st1		[r32]=r0,1;;
+	and		r2=7,r32	}
+{ .mib;	add		r33=-1,r33
+	br		.Lot		};;
+
+.Laligned:
+{ .mmi;	st8		[r32]=r0,8
+	and		r2=-8,r33	// len&~7
+	add		r33=-8,r33	};;	// len-=8
+{ .mib;	cmp.ltu		p6,p0=8,r2	// ((len+8)&~7)>8
+(p6)	br.cond.dptk	.Laligned	};;
+
+{ .mbb;	cmp.eq		p6,p7=r0,r33
+(p7)	br.cond.dpnt	.Little
+(p6)	br.ret.sptk.many	b0	};;
+.endp	OPENSSL_cleanse#
diff --git a/crypto/mem.c b/crypto/mem.c
index 6635167228..43d48ab425 100644
--- a/crypto/mem.c
+++ b/crypto/mem.c
@@ -250,7 +250,6 @@ void CRYPTO_get_mem_debug_functions(void (**m)(void *,int,const char *,int,int),
 void *CRYPTO_malloc_locked(int num, const char *file, int line)
 	{
 	void *ret = NULL;
-	extern unsigned char cleanse_ctr;
 
 	if (num <= 0) return NULL;
 
@@ -267,11 +266,15 @@ void *CRYPTO_malloc_locked(int num, const char *file, int line)
 	if (malloc_debug_func != NULL)
 		malloc_debug_func(ret, num, file, line, 1);
 
+#ifndef OPENSSL_CPUID_OBJ
         /* Create a dependency on the value of 'cleanse_ctr' so our memory
          * sanitisation function can't be optimised out. NB: We only do
          * this for >2Kb so the overhead doesn't bother us. */
         if(ret && (num > 2048))
+	{	extern unsigned char cleanse_ctr;
 		((unsigned char *)ret)[0] = cleanse_ctr;
+	}
+#endif
 
 	return ret;
 	}
@@ -291,7 +294,6 @@ void CRYPTO_free_locked(void *str)
 void *CRYPTO_malloc(int num, const char *file, int line)
 	{
 	void *ret = NULL;
-	extern unsigned char cleanse_ctr;
 
 	if (num <= 0) return NULL;
 
@@ -308,11 +310,15 @@ void *CRYPTO_malloc(int num, const char *file, int line)
 	if (malloc_debug_func != NULL)
 		malloc_debug_func(ret, num, file, line, 1);
 
+#ifndef OPENSSL_CPUID_OBJ
         /* Create a dependency on the value of 'cleanse_ctr' so our memory
          * sanitisation function can't be optimised out. NB: We only do
          * this for >2Kb so the overhead doesn't bother us. */
         if(ret && (num > 2048))
+	{	extern unsigned char cleanse_ctr;
 		((unsigned char *)ret)[0] = cleanse_ctr;
+	}
+#endif
 
 	return ret;
 	}
diff --git a/crypto/sparccpuid.S b/crypto/sparccpuid.S
index 52308abca6..f691abc57f 100644
--- a/crypto/sparccpuid.S
+++ b/crypto/sparccpuid.S
@@ -232,6 +232,58 @@ _sparcv9_rdtick:
 .type	_sparcv9_rdtick,#function
 .size	_sparcv9_rdtick,.-_sparcv9_rdtick
 
+.global	OPENSSL_cleanse
+.align	32
+OPENSSL_cleanse:
+	cmp	%o1,6
+	nop
+#ifdef ABI64
+	bgu	%xcc,.Lot
+#else
+	bgu	.Lot
+#endif
+	cmp	%o1,0		! delay slot: set flags for the len==0 test
+	bne	.Little
+	nop
+	retl			! len==0: nothing to wipe
+	nop
+
+.Little:
+	stb	%g0,[%o0]
+	subcc	%o1,1,%o1
+	bnz	.Little
+	add	%o0,1,%o0
+	retl
+	nop
+.align	32
+.Lot:
+	andcc	%o0,3,%g0
+	bz	.Laligned
+	nop
+	stb	%g0,[%o0]
+	sub	%o1,1,%o1
+	ba	.Lot
+	add	%o0,1,%o0
+	nop
+.Laligned:
+	st	%g0,[%o0]
+	sub	%o1,4,%o1
+	andcc	%o1,-4,%g0
+#ifdef ABI64
+	bnz	%xcc,.Laligned
+#else
+	bnz	.Laligned
+#endif
+	add	%o0,4,%o0
+
+	cmp	%o1,0
+	bne	.Little
+	nop
+	retl
+	nop
+.type	OPENSSL_cleanse,#function
+.size	OPENSSL_cleanse,.-OPENSSL_cleanse
+
 .section	".init",#alloc,#execinstr
 	call	OPENSSL_cpuid_setup
 	nop
diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl
index bc06e99cfb..2f657ca9d8 100644
--- a/crypto/x86_64cpuid.pl
+++ b/crypto/x86_64cpuid.pl
@@ -155,4 +155,39 @@ OPENSSL_ia32_cpuid:
 	or	%rcx,%rax
 	ret
 .size	OPENSSL_ia32_cpuid,.-OPENSSL_ia32_cpuid
+
+.globl	OPENSSL_cleanse
+.type	OPENSSL_cleanse,\@function,2
+.align	16
+OPENSSL_cleanse:
+	xor	%rax,%rax
+	cmp	\$15,%rsi
+	jae	.Lot
+	cmp	\$0,%rsi
+	je	.Lret
+.Little:
+	mov	%al,(%rdi)
+	sub	\$1,%rsi
+	lea	1(%rdi),%rdi
+	jnz	.Little
+.Lret:
+	ret
+.align	16
+.Lot:
+	test	\$7,%rdi
+	jz	.Laligned
+	mov	%al,(%rdi)
+	lea	-1(%rsi),%rsi
+	lea	1(%rdi),%rdi
+	jmp	.Lot
+.Laligned:
+	mov	%rax,(%rdi)
+	lea	-8(%rsi),%rsi
+	test	\$-8,%rsi
+	lea	8(%rdi),%rdi
+	jnz	.Laligned
+	cmp	\$0,%rsi
+	jne	.Little
+	ret
+.size	OPENSSL_cleanse,.-OPENSSL_cleanse
 ___
diff --git a/crypto/x86cpuid.pl b/crypto/x86cpuid.pl
index 7d924a60b7..13828d5633 100644
--- a/crypto/x86cpuid.pl
+++ b/crypto/x86cpuid.pl
@@ -216,6 +216,40 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 	}
 &function_end_B("OPENSSL_indirect_call");
 
+&function_begin_B("OPENSSL_cleanse");
+	&mov	("edx",&wparam(0));
+	&mov	("ecx",&wparam(1));
+	&xor	("eax","eax");
+	&cmp	("ecx",7);
+	&jae	(&label("lot"));
+	&cmp	("ecx",0);		# len==0: nothing to wipe
+	&je	(&label("ret"));
+&set_label("little");
+	&mov	(&BP(0,"edx"),"al");
+	&sub	("ecx",1);
+	&lea	("edx",&DWP(1,"edx"));
+	&jnz	(&label("little"));
+&set_label("ret");
+	&ret	();
+
+&set_label("lot",16);
+	&test	("edx",3);
+	&jz	(&label("aligned"));
+	&mov	(&BP(0,"edx"),"al");
+	&lea	("ecx",&DWP(-1,"ecx"));
+	&lea	("edx",&DWP(1,"edx"));
+	&jmp	(&label("lot"));
+&set_label("aligned");
+	&mov	(&DWP(0,"edx"),"eax");
+	&lea	("ecx",&DWP(-4,"ecx"));
+	&test	("ecx",-4);
+	&lea	("edx",&DWP(4,"edx"));
+	&jnz	(&label("aligned"));
+	&cmp	("ecx",0);
+	&jne	(&label("little"));
+	&ret	();
+&function_end_B("OPENSSL_cleanse");
+
 &initseg("OPENSSL_cpuid_setup");
 
 &asm_finish();
-- 
2.25.1